diff options
104 files changed, 5087 insertions, 3089 deletions
@@ -10,6 +10,7 @@ onig-config libtool aclocal.m4 Makefile.in +.python-version *.o *.obj *.so @@ -35,7 +36,6 @@ m4/*.m4 /fuzzers # src/ -/src/CaseFolding.txt /src/unicode_fold?_key.gperf /src/unicode_unfold_key.gperf /src/UNICODE_PROPERTIES @@ -47,6 +47,8 @@ m4/*.m4 /test/testc /test/testcu /test/testp +/test/test_regset +/test/kofu-utf8.txt # sample/ /sample/crnl @@ -62,6 +64,15 @@ m4/*.m4 /sample/echo /sample/count /sample/bug_fix +/sample/regset /sample/log* /harnesses/utf16*.dict +/harnesses/*-libfuzzer +/harnesses/main-* +/harnesses/libfuzzer-onig +/harnesses/libfuzzer-onig-full +/harnesses/slow-unit-* +/harnesses/timeout-* +/harnesses/crash-* +/harnesses/oom-* diff --git a/CMakeLists.txt b/CMakeLists.txt index c59bfe3..bce888a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.1) project(oniguruma - VERSION 6.9.3 + VERSION 6.9.4 LANGUAGES C) set(PACKAGE onig) @@ -64,6 +64,9 @@ target_include_directories(onig PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src> $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>) +target_compile_definitions(onig PUBLIC + $<$<NOT:$<BOOL:${BUILD_SHARED_LIBS}>>:ONIG_STATIC>) + if(MSVC) target_compile_options(onig PRIVATE #/W4 @@ -75,7 +78,6 @@ if(MSVC) $<$<CONFIG:MinSizeRel>:/MT> $<$<CONFIG:RelWithDebgInfo>:/MTd> ) - target_compile_definitions(onig PUBLIC -DONIG_STATIC) endif() elseif(CMAKE_COMPILER_IS_GNUCC) target_compile_options(onig PRIVATE @@ -1,8 +1,33 @@ History +2019/11/29: Version 6.9.4 + +2019/11/22: Release Candidate 3 for Version 6.9.4 + +2019/11/20: fix a problem found by libFuzzer test +2019/11/14: Release Candidate 2 for Version 6.9.4 +2019/11/12: fix integer overflow by nested quantifier +2019/11/11: fix CVE-2019-19012: Integer overflow related to reg->dmax in search_in_range() +2019/11/07: fix CVE-2019-19203: heap-buffer-overflow in gb18030_mbc_enc_len() +2019/11/06: fix CVE-2019-19204: heap-buffer-overflow in fetch_interval_quantifier() +2019/11/06: add 
HAVE_INTTYPES_H into config.h.windows.in and config.h.win{32,64} +2019/11/06: add HAVE_STDINT_H into config.h.win{32,64} +2019/11/05: Release Candidate 1 for Version 6.9.4 +2019/10/31: Update Unicode Emoji version to 12.1 (Nothing data changed) +2019/10/29: implement USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR configuration +2019/10/18: re-implement case fold conversion +2019/10/04: fix #156: Heap buffer overflow in match_at() with case-insensitive match +2019/09/30: NEW API: add onig_regset_replace() +2019/09/30: change Unicode VERSION value format +2019/09/20: NEW API: add regset functions +2019/09/20: add data ensure check before peek string value in OP_PUSH_IF_PEEK_NEXT +2019/09/20: fix loose code in encode-harness.c +2019/08/13: fix heap-buffer-overflow +2019/08/13: Add a macro to disable direct threading in the match engine (PR#149) + 2019/08/06: Version 6.9.3 (secirity fix release) -2019/07/30: add ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE +2019/07/30: add ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC 2019/07/29: add STK_PREC_READ_START/END stack type 2019/07/29: Fix #147: Stack Exhaustion Problem caused by some parsing functions 2019/07/11: add a dictionary file for libfuzzer diff --git a/Makefile.am b/Makefile.am index a0bbc7b..ac5e27f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,7 +14,7 @@ EXTRA_DIST = oniguruma.pc.in HISTORY README_japanese README.md \ doc/SYNTAX.md doc/UNICODE_PROPERTIES \ src/Makefile.windows src/config.h.windows.in \ src/config.h.win32 src/config.h.win64 \ - windows/testc.c contributed/libfuzzer-onig.cpp contributed/makefile + windows/testc.c bin_SCRIPTS = onig-config @@ -39,9 +39,12 @@ pkgconfig_DATA = oniguruma.pc all-test: cd test; make test +archive: + git archive --format=tar --prefix=oniguruma/ HEAD | gzip > ../oniguruma-archive.tar.gz + sanitize: make clean - ./configure CC=clang CFLAGS="-O -g -fsanitize=address" + ./configure --enable-posix-api=no CC=clang CFLAGS="-O -g -fsanitize=address" LDFLAGS="-fsanitize=address" make make 
all-test @@ -27,25 +27,34 @@ Supported character encodings: * doc/SYNTAX.md: contributed by seanofw +Version 6.9.4 +------------- + +* NEW API: RegSet (set of regexes) +* Fixed CVE-2019-19012 +* Fixed CVE-2019-19203 (Does not affect UTF-8, UTF-16 and UTF-32 encodings) +* Fixed CVE-2019-19204 (Affects only PosixBasic, Emacs and Grep syntaxes) +* Fixed CVE-2019-19246 +* Fixed some problems (found by libFuzzer test) + + Version 6.9.3 (security fix release) ------------------------------------ * Fixed CVE-2019-13224 * Fixed CVE-2019-13225 -* Fixed many problems (found by libfuzzer programs) +* Fixed CVE-2019-16163 +* Fixed many problems (found by libFuzzer test) Version 6.9.2 (Reiwa) --------------------- * add doc/SYNTAX.md +* Direct threaded code (for GCC and Clang) * Update Unicode version 12.1.0 * NEW: Unicode Text Segment mode option (?y{g}) (?y{w}) (*original) - g: Extended Grapheme Cluster mode / w: Word mode - - (Unicode Standard Annex #29 [http://unicode.org/reports/tr29/]) - Version 6.9.1 ------------- @@ -118,7 +127,7 @@ Version 6.5.0 * NEW: \O (true anychar) * NEW: if-then-else (?(...)...\|...) 
* NEW: Backreference validity checker (?(xxx)) (*original) -* NEW: Absent repeater (?~absent) \[is equal to (?\~\|absent|\O*)] +* NEW: Absent repeater (?~absent) \[is equal to (?\~\|(?:absent)|\O*)] * NEW: Absent expression (?~|absent|expr) (*original) * NEW: Absent stopper (?~|absent) (*original) @@ -244,15 +253,18 @@ Sample Programs |File |Description | |:---------------------|:-----------------------------------------| +|sample/callout.c |example of callouts | +|sample/count.c |example of built-in callout *COUNT | +|sample/echo.c |example of user defined callouts of name | +|sample/encode.c |example of some encodings | +|sample/listcap.c |example of the capture history | +|sample/names.c |example of the named group callback | +|sample/posix.c |POSIX API sample | +|sample/regset.c |example of using RegSet API | +|sample/scan.c |example of using onig_scan() | |sample/simple.c |example of the minimum (Oniguruma API) | -|sample/names.c |example of the named group callback. | -|sample/encode.c |example of some encodings. | -|sample/listcap.c |example of the capture history. | -|sample/posix.c |POSIX API sample. | -|sample/scan.c |example of using onig_scan(). | -|sample/sql.c |example of the variable meta characters. | -|sample/user_property.c|example of user defined Unicode property. | -|sample/callout.c |example of callouts. 
| +|sample/sql.c |example of the variable meta characters | +|sample/user_property.c|example of user defined Unicode property | Test Programs diff --git a/build_harnesses.sh b/build_harnesses.sh deleted file mode 100755 index 54dc9ff..0000000 --- a/build_harnesses.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -make clean -autoreconf -vfi - -# build the library with ASAN -#NO_LINK="-fsanitize=fuzzer-no-link" -NO_LINK="" -./configure CC=clang LD=clang CFLAGS="-g -fsanitize=address -fno-omit-frame-pointer $NO_LINK" LDFLAGS="-g -fsanitize=address -fno-omit-frame-pointer $NO_LINK" -make -j4 - -OUT=`pwd`/fuzzers -mkdir -p $OUT -LIBFUZZER_FLAGS="-fsanitize=fuzzer,address -fno-omit-frame-pointer" -#LIBS="src/.libs/libonig.a" -LIBS="src/.libs/libonig.a /usr/local/lib/libLLVMFuzzerMain.a" - -CFLAGS="-Isrc -g $LIBFUZZER_FLAGS" - -# Libfuzzer builds -clang++ contributed/libfuzzer-onig.cpp $LIBS $CFLAGS -o $OUT/libfuzzer-onig -clang harnesses/syntax-harness.c $LIBS $CFLAGS -o $OUT/syntax-libfuzzer -clang harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/encode-libfuzzer -clang harnesses/deluxe-encode-harness.c $LIBS $CFLAGS -o $OUT/deluxe-encode-libfuzzer - -clang -DUTF16_BE harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/utf16-be-libfuzzer -clang -DUTF16_LE harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/utf16-le-libfuzzer -clang -DWITH_READ_MAIN harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-encode -clang -DWITH_READ_MAIN -DUTF16_LE harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-utf16-le -clang -DWITH_READ_MAIN -DUTF16_BE harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-utf16-be -clang -DWITH_READ_MAIN harnesses/deluxe-encode-harness.c $LIBS $CFLAGS -o $OUT/main-deluxe-encode diff --git a/configure.ac b/configure.ac index 62c9fa5..ac51e85 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ dnl Process this file with autoconf to produce a configure script. 
-AC_INIT(onig, 6.9.3) +AC_INIT(onig, 6.9.4) AC_CONFIG_MACRO_DIR([m4]) diff --git a/contributed/makefile b/contributed/makefile deleted file mode 100644 index f44a3c0..0000000 --- a/contributed/makefile +++ /dev/null @@ -1,21 +0,0 @@ -ONIG_LIB=../src/.libs/libonig.a -LIBS=$(ONIG_LIB) /usr/local/lib/libLLVMFuzzerMain.a - -TARGETS=libfuzzer-onig libfuzzer-onig-full - -default: $(TARGETS) - -libfuzzer-onig: libfuzzer-onig.cpp $(ONIG_LIB) - clang++ $< $(LIBS) -o $@ -fsanitize-coverage=trace-pc-guard -fsanitize=fuzzer,address - -libfuzzer-onig-full: libfuzzer-onig.cpp $(ONIG_LIB) - clang++ -DFULL_TEST $< $(LIBS) -o $@ -fsanitize-coverage=trace-pc-guard -fsanitize=fuzzer,address - - -$(ONIG_LIB): - cd ..; ./configure CC=clang LD=clang CFLAGS="-g -fsanitize=fuzzer,address" LDFLAGS="-fsanitize-coverage=trace-pc-guard -fsanitize=fuzzer,address"; make - - - -clean: - rm -f $(TARGETS) @@ -1,4 +1,4 @@ -Oniguruma API Version 6.9.3 2019/07/06 +Oniguruma API Version 6.9.4 2019/09/30 #include <oniguruma.h> @@ -168,7 +168,7 @@ Oniguruma API Version 6.9.3 2019/07/06 # int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo) - This function is deprecate, and it does not allow the case where + This function is deprecated, and it does not allow the case where the encoding of pattern and target is different. Create a regex object. @@ -306,6 +306,7 @@ Oniguruma API Version 6.9.3 2019/07/06 normal return: match position offset (i.e. p - str >= 0) not found: ONIG_MISMATCH (< 0) + error: error code (< 0) arguments 1 reg: regex object @@ -342,7 +343,8 @@ Oniguruma API Version 6.9.3 2019/07/06 Do not pass invalid byte string in the regex character encoding. 
normal return: match length (>= 0) - not match: ONIG_MISMATCH ( < 0) + not match: ONIG_MISMATCH (< 0) + error: error code (< 0) arguments 1 reg: regex object @@ -391,6 +393,136 @@ Oniguruma API Version 6.9.3 2019/07/06 7 callback_arg: optional argument passed to callback +# int onig_regset_new(OnigRegSet** rset, int n, regex_t* regs[]) + + Create a regset object. + All regex objects must have the same character encoding. + All regex objects are prohibited from having the ONIG_OPTION_FIND_LONGEST option. + + arguments + 1 rset: return address of regset object + 2 n: number of regex in regs + 3 regs: array of regex + + normal return: ONIG_NORMAL + + +# int onig_regset_add(OnigRegSet* set, regex_t* reg) + + Add a regex into regset. + The regex object must have the same character encoding with the regset. + The regex object is prohibited from having the ONIG_OPTION_FIND_LONGEST option. + + arguments + 1 set: regset object + 2 reg: regex object + + normal return: ONIG_NORMAL + + +# int onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) + + Replace a regex in regset with another one. + If the reg argument value is NULL, then remove at-th regex. (and indexes of other regexes are changed) + + arguments + 1 set: regset object + 2 at: index of regex (zero origin) + 3 reg: regex object + + normal return: ONIG_NORMAL + + +# void onig_regset_free(OnigRegSet* set) + + Free memory used by regset object and regex objects in the regset. + If the same regex object is registered twice, the situation becomes destructive. + + arguments + 1 set: regset object + + +# int onig_regset_number_of_regex(OnigRegSet* set) + + Returns number of regex objects in the regset. + + arguments + 1 set: regset object + + +# regex_t* onig_regset_get_regex(OnigRegSet* set, int at) + + Returns the regex object corresponding to the at-th regex. 
+ + arguments + 1 set: regset object + 2 at: index of regex array (zero origin) + + +# OnigRegion* onig_regset_get_region(OnigRegSet* set, int at) + + Returns the region object corresponding to the at-th regex. + + arguments + 1 set: regset object + 2 at: index of regex array (zero origin) + + +# int onig_regset_search(OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, int* rmatch_pos) + + Perform a search with regset. + + return value: + normal return: index of match regex (zero origin) + not found: ONIG_MISMATCH (< 0) + error: error code (< 0) + + arguments + 1 set: regset object + 2 str: target string + 3 end: terminate address of target string + 4 start: search start address of target string + 5 range: search terminate address of target string + 6 lead: outer loop element + ONIG_REGSET_POSITION_LEAD (returns most left position) + ONIG_REGSET_REGEX_LEAD (returns most left position) + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (returns first match regex) + 7 option: search time option + ONIG_OPTION_NOTBOL string head(str) isn't considered as begin of line + ONIG_OPTION_NOTEOL string end (end) isn't considered as end of line + 8 rmatch_pos: return address of match position (match_address - str) + + * ONIG_REGSET_POSITION_LEAD and ONIG_REGSET_REGEX_LEAD return the same result. + These differences only appear in search time. + In most cases, ONIG_REGSET_POSITION_LEAD seems to be faster. + + +# int onig_regset_search_with_param(OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, OnigMatchParam* mps[], int* rmatch_pos) + + Perform a search with regset and match-params. 
+ + return value: + normal return: index of match regex (zero origin) + not found: ONIG_MISMATCH (< 0) + error: error code (< 0) + + arguments + 1 set: regset object + 2 str: target string + 3 end: terminate address of target string + 4 start: search start address of target string + 5 range: search terminate address of target string + 6 lead: outer loop element + ONIG_REGSET_POSITION_LEAD (returns most left position) + ONIG_REGSET_REGEX_LEAD (returns most left position) + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (returns first match regex) + 7 option: search time option + ONIG_OPTION_NOTBOL string head(str) isn't considered as begin of line + ONIG_OPTION_NOTEOL string end (end) isn't considered as end of line + 8 mps: array of match-params + 9 rmatch_pos: return address of match position (match_address - str) + + # OnigRegion* onig_region_new(void) Create a region. @@ -1,4 +1,4 @@ -鬼車インターフェース Version 6.9.3 2019/07/06 +鬼車インターフェース Version 6.9.4 2019/09/30 #include <oniguruma.h> @@ -390,6 +390,138 @@ 7 callback_arg: コールバック関数に渡される付加引数値 +# int onig_regset_new(OnigRegSet** rset, int n, regex_t* regs[]) + + regsetオブジェクトを生成する。 + 全ての正規表現オブジェクトは、同じ文字エンコーディングでなければならない。 + 全ての正規表現オブジェクトは、ONIG_OPTION_FIND_LONGESTオプションでコンパイルされていてはならない。 + + 引数 + 1 rset: regsetオブジェクトを返すためのアドレス + 2 n: 正規表現の個数 + 3 regs: 正規表現オブジェクトの配列 + + 正常終了戻り値: ONIG_NORMAL + + +# int onig_regset_add(OnigRegSet* set, regex_t* reg) + + regsetオブジェクトに正規表現を追加する。 + 正規表現オブジェクトは、regsetと同じ文字エンコーディングでなければならない。 + 正規表現オブジェクトは、ONIG_OPTION_FIND_LONGESTオプションでコンパイルされていてはならない。 + + 引数 + 1 set: regsetオブジェクト + 2 reg: 正規表現オブジェクト + + 正常終了戻り値: ONIG_NORMAL + + +# int onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) + + regsetの中の一個の正規表現オブジェクトを別のものに変更する。 + 若しreg引数の値がNULLであれば、at番目の正規表現オブジェクトを外す。(そして、以降の正規表現オブジェクトのインデックスは変化する) + + 引数 + 1 set: regsetオブジェクト + 2 at: 変更する場所のインデックス + 2 reg: 正規表現オブジェクト + + 正常終了戻り値: ONIG_NORMAL + + +# void onig_regset_free(OnigRegSet* set) + + regsetオブジェクトとその中の正規表現オブジェクトの使用メモリを開放する。 + 
若し、同一の正規表現オブジェクトを重複して登録していれば、破壊的な状況になる。 + + 引数 + 1 set: regsetオブジェクト + + +# int onig_regset_number_of_regex(OnigRegSet* set) + + regsetの中の正規表現オブジェクトの個数を返す。 + + 引数 + 1 set: regsetオブジェクト + + +# regex_t* onig_regset_get_regex(OnigRegSet* set, int at) + + regsetのat番目の正規表現を返す。 + + 引数 + 1 set: regsetオブジェクト + 2 at: 正規表現オブジェクトのインデックス (ゼロ開始) + + +# OnigRegion* onig_regset_get_region(OnigRegSet* set, int at) + + regsetのat番目の正規表現に対応する領域を返す。 + + 引数 + 1 set: regsetオブジェクト + 2 at: 正規表現オブジェクトのインデックス (ゼロ開始) + + +# int onig_regset_search(OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, int* rmatch_pos) + + regsetによる検索を実行する。 + + 戻り値: + 検索成功: マッチした正規表現オブジェクトのインデックス (ゼロ開始) + 検索失敗: ONIG_MISMATCH (< 0) + エラー: エラーコード (< 0) + + 引数 + 1 set: regsetオブジェクト + 2 str: 検索対象文字列 + 3 end: 検索対象文字列の終端アドレス + 4 start: 検索対象文字列の検索先頭位置アドレス + 5 range: 検索対象文字列の検索終了位置アドレス + (start <= 探索される文字列 < range) + 6 lead: 外側のループ要素 + ONIG_REGSET_POSITION_LEAD (最左位置でマッチした結果を返す) + ONIG_REGSET_REGEX_LEAD (最左位置でマッチした結果を返す) + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (最初にマッチした正規表現の結果を返す) + 7 option: 検索時オプション + ONIG_OPTION_NOTBOL 文字列の先頭(str)を行頭と看做さない + ONIG_OPTION_NOTEOL 文字列の終端(end)を行末と看做さない + 8 rmatch_pos: マッチした位置を返すためのアドレス (match_address - str) + + * ONIG_REGSET_POSITION_LEADとONIG_REGSET_REGEX_LEADは同じ結果を返す。 + これらの違いは検索時間にしか現れない。 + ほとんどの場合、ONIG_REGSET_POSITION_LEADのほうが速いと思われる。 + + +# int onig_regset_search_with_param(OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, OnigMatchParam* mps[], int* rmatch_pos) + + regsetとOnigMatchParamオブジェクトによる検索を実行する。 + + 戻り値: + 検索成功: マッチした正規表現オブジェクトのインデックス (ゼロ開始) + 検索失敗: ONIG_MISMATCH (< 0) + エラー: エラーコード (< 0) + + 引数 + 1 set: regsetオブジェクト + 2 str: 検索対象文字列 + 3 end: 検索対象文字列の終端アドレス + 4 start: 検索対象文字列の検索先頭位置アドレス + 5 range: 検索対象文字列の検索終了位置アドレス + (start <= 探索される文字列 < range) + 6 lead: 外側のループ要素 + 
ONIG_REGSET_POSITION_LEAD (最左位置でマッチした結果を返す) + ONIG_REGSET_REGEX_LEAD (最左位置でマッチした結果を返す) + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (最初にマッチした正規表現の結果を返す) + 7 option: 検索時オプション + ONIG_OPTION_NOTBOL 文字列の先頭(str)を行頭と看做さない + ONIG_OPTION_NOTEOL 文字列の終端(end)を行末と看做さない + 8 mps: OnigMatchParamオブジェクトの配列 + 9 rmatch_pos: マッチした位置を返すためのアドレス (match_address - str) + + # OnigRegion* onig_region_new(void) マッチ領域情報(region)を作成する。 @@ -1,4 +1,4 @@ -Oniguruma Regular Expressions Version 6.9.2 2019/03/29 +Oniguruma Regular Expressions Version 6.9.4 2019/10/31 syntax: ONIG_SYNTAX_ONIGURUMA (default) @@ -289,6 +289,11 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) In negative look-behind, capturing group isn't allowed, but non-capturing group (?:) is allowed. + * In look-behind and negative look-behind, support for + ignore-case option is limited. Only supports conversion + between single characters. (Does not support conversion + of multiple characters in Unicode) + (?>subexp) atomic group no backtracks in subexp. @@ -338,7 +343,7 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) This works like .* (more precisely \O*), but it is limited by the range that does not include the string match with <absent>. - This is a written abbreviation of (?~|absent|\O*). + This is a written abbreviation of (?~|(?:absent)|\O*). \O* is used as a repeater. 
(?~|absent|exp) Absent expression (* original) @@ -1,4 +1,4 @@ -鬼車 正規表現 Version 6.9.2 2019/03/29 +鬼車 正規表現 Version 6.9.4 2019/10/31 使用文法: ONIG_SYNTAX_ONIGURUMA (既定値) @@ -21,10 +21,10 @@ \f 改頁 (0x0C) \a 鐘 (0x07) \e 退避修飾 (0x1B) - \nnn 八進数表現 符号化バイト値(の一部) + \nnn 八進数表現 符号化バイト値 \o{17777777777} 拡張八進数表現 コードポイント値 \uHHHH 拡張十六進数表現 コードポイント値 - \xHH 十六進数表現 符号化バイト値(の一部) + \xHH 十六進数表現 符号化バイト値 \x{7HHHHHHH} 拡張十六進数表現 コードポイント値 \cx 制御文字表現 コードポイント値 \C-x 制御文字表現 コードポイント値 @@ -284,6 +284,10 @@ 否定戻り読みでは、捕獲式集合は許されないが、 非捕獲式集合は許される。 + * 戻り読み、否定戻り読みの中では、ignore-caseオプションの + 対応が制限される。一文字と一文字の間の変換しか対応しない。 + (Unicodeでの複数文字の変換に対応しない) + (?>式) 原子的式集合 式全体を通過したとき、式の中での後退再試行を行なわない @@ -334,20 +338,20 @@ <不在機能群> - (?~不在式) 不在繰り返し (*原案 田中哲) - これは .*(より正確には\O*)のように動作するが、<不在式>に + (?~不在) 不在繰り返し (*原案 田中哲) + これは .*(より正確には\O*)のように動作するが、<不在>に 適合する文字列を含まない範囲に制限される。 - これは(?~|不在式|\O*)の省略表記である。 + これは(?~|(?:不在)|\O*)の省略表記である。 - (?~|不在式|式) 不在式 (* 原作) - これは<式>のように動作するが、<不在式>に適合する文字列を + (?~|不在|式) 不在式 (* 原作) + これは<式>のように動作するが、<不在>に適合する文字列を 含まない範囲に制限される。 例 (?~|345|\d*) "12345678" ==> "12", "1", "" - (?~|不在式) 不在停止 (* 原作) + (?~|不在) 不在停止 (* 原作) この演算子を通過した後は、対象文字列の適合範囲が - <不在式>に適合する文字列を含まない範囲に制限される。 + <不在>に適合する文字列を含まない範囲に制限される。 (?~|) 範囲消去 不在停止の効果を消して、それ以前の状態にする。 diff --git a/doc/SYNTAX.md b/doc/SYNTAX.md index 449f262..69ecf3a 100644 --- a/doc/SYNTAX.md +++ b/doc/SYNTAX.md @@ -1,7 +1,7 @@ # Oniguruma syntax (operator) configuration -_Documented for Oniguruma 6.9.2 (2019/03/28)_ +_Documented for Oniguruma 6.9.3 (2019/08/08)_ ---------- @@ -960,6 +960,12 @@ _Set in: Ruby, Oniguruma_ If this flag is set, Oniguruma will warn about nested repeat operators those have no meaning, like `(?:a*)+`. If this flag is clear, Oniguruma will allow the nested repeat operators without warning about them. +### 26. ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC (allow [a-\x{7fffffff}]) + +_Set in: Oniguruma_ + +If this flag is set, then invalid code points at the end of range in character class are allowed. + ### 31. 
ONIG_SYN_CONTEXT_INDEP_ANCHORS _Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ @@ -1066,4 +1072,5 @@ These tables show which of the built-in syntaxes use which flags and options, fo | 23 | `ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | | 24 | `ONIG_SYN_WARN_CC_OP_NOT_ESCAPED` | - | - | - | - | - | - | - | - | Yes | Yes | | 25 | `ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT` | - | - | - | - | - | - | - | - | Yes | Yes | +| 26 | `ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC` | - | - | - | - | - | - | - | - | - | Yes | | 31 | `ONIG_SYN_CONTEXT_INDEP_ANCHORS` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | diff --git a/doc/UNICODE_PROPERTIES b/doc/UNICODE_PROPERTIES index ff2a6ce..24c2031 100644 --- a/doc/UNICODE_PROPERTIES +++ b/doc/UNICODE_PROPERTIES @@ -1,4 +1,4 @@ -Unicode Properties (from Unicode Version: 12.1.0) +Unicode Properties (Unicode Version: 12.1.0, Emoji: 12.1) 15: ASCII_Hex_Digit 16: Adlam diff --git a/harnesses/ascii_compatible.dict b/harnesses/ascii_compatible.dict index 820bf47..e6e00db 100644 --- a/harnesses/ascii_compatible.dict +++ b/harnesses/ascii_compatible.dict @@ -1,10 +1,7 @@ # First-pass fuzzing dictionary for Oniguruma by Mark Griffin -"\\o{17777777777}" -"\\777" -"\\u" -"\\uFFFF" -"\\xFF" -"\\x{70000000}" +"\\o{34}" +"\\123" +"\\x{40}" "\\C-" "\\M-\\C-" "\\X" @@ -12,6 +9,8 @@ "\\p{^" "}" "]" +"]" +")" ")" "\\n" "\\r" @@ -47,10 +46,13 @@ "\\B" "(?y{" "[abcd1-9]" +"[\\w]" +"[\\W]" +"[\\s]" +"[\\S]" "[\\w\\d" "[\\p{Alphabetic}" -"[\\P{Arabic}" -"[\\x{ffff}" +"[\\x{03}" "[a-w&&" "[^" "[:graph:]" @@ -88,7 +90,6 @@ "(?(<name+0>))" "(?(<name+1>))" "(?(<name-1>))" -"(*ERROR{-2000})" "(*COUNT[tag]{X})" "\\1" "\\2" @@ -106,6 +107,5 @@ "(?<name>a|b\\g<name>c)" "(?-i:\\g<name>)" "\\N{name}" -"\\p{Hiragana}" "\\p{Katakana}" "\\p{Emoji}" diff --git a/harnesses/deluxe-encode-harness.c b/harnesses/deluxe-encode-harness.c index e1f84a5..aabe916 100644 --- 
a/harnesses/deluxe-encode-harness.c +++ b/harnesses/deluxe-encode-harness.c @@ -49,39 +49,6 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) return 0; } -static int -exec(OnigEncoding enc, OnigOptionType options, - char* apattern, char* apattern_end, char* astr, char* astr_end) -{ - int r; - regex_t* reg; - OnigErrorInfo einfo; - UChar* pattern = (UChar* )apattern; - UChar* str = (UChar* )astr; - UChar* pattern_end = (UChar* )apattern_end; - unsigned char *end = (unsigned char* )astr_end; - - onig_initialize(&enc, 1); - onig_set_retry_limit_in_match(DEFAULT_LIMIT); - onig_set_parse_depth_limit(DEFAULT_LIMIT); - - r = onig_new(®, pattern, pattern_end, - options, enc, ONIG_SYNTAX_DEFAULT, &einfo); - if (r != ONIG_NORMAL) { - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str((UChar* )s, r, &einfo); - fprintf(stdout, "ERROR: %s\n", s); - onig_end(); - return -1; - } - - r = search(reg, str, end); - - onig_free(reg); - onig_end(); - return 0; -} - static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; static int @@ -196,15 +163,13 @@ int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) remaining_size--; // copy first PATTERN_SIZE bytes off to be the pattern - pattern = (unsigned char *)malloc(PATTERN_SIZE+4); - memset(pattern, 0, PATTERN_SIZE+4); + pattern = (unsigned char *)malloc(PATTERN_SIZE); memcpy(pattern, data, PATTERN_SIZE); pattern_end = pattern + PATTERN_SIZE; data += PATTERN_SIZE; remaining_size -= PATTERN_SIZE; - str = (unsigned char*)malloc(remaining_size+4); - memset(str, 0, remaining_size+4); + str = (unsigned char*)malloc(remaining_size); memcpy(str, data, remaining_size); str_end = str + remaining_size; diff --git a/harnesses/encode-harness.c b/harnesses/encode-harness.c index e57fd4f..5db0512 100644 --- a/harnesses/encode-harness.c +++ b/harnesses/encode-harness.c @@ -3,13 +3,19 @@ * contributed by Mark Griffin */ #include <stdio.h> -#include "oniguruma.h" - +#include <unistd.h> #include <stdlib.h> #include <string.h> 
+#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <time.h> -#define PARSE_DEPTH_LIMIT 120 -#define RETRY_LIMIT 4000 +#include "oniguruma.h" + + +//#define PARSE_DEPTH_LIMIT 120 +#define RETRY_LIMIT 3500 typedef unsigned char uint8_t; @@ -26,6 +32,7 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) range = end; r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); if (r >= 0) { +#ifdef WITH_READ_MAIN int i; fprintf(stdout, "match at %d (%s)\n", r, @@ -33,17 +40,29 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) for (i = 0; i < region->num_regs; i++) { fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); } +#endif } else if (r == ONIG_MISMATCH) { +#ifdef WITH_READ_MAIN fprintf(stdout, "search fail (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); +#endif } else { /* error */ +#ifdef WITH_READ_MAIN char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); fprintf(stdout, "ERROR: %s\n", s); fprintf(stdout, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); +#endif onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + + if (r == ONIGERR_STACK_BUG || + r == ONIGERR_UNDEFINED_BYTECODE || + r == ONIGERR_UNEXPECTED_BYTECODE) + return -2; + return -1; } @@ -51,8 +70,14 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) return 0; } +static long INPUT_COUNT; +static long EXEC_COUNT; +static long EXEC_COUNT_INTERVAL; +static long REGEX_SUCCESS_COUNT; +static long VALID_STRING_COUNT; + static int -exec(OnigEncoding enc, OnigOptionType options, +exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax, char* apattern, char* apattern_end, char* astr, UChar* end) { int r; @@ -62,22 +87,41 @@ exec(OnigEncoding enc, OnigOptionType options, UChar* str = (UChar* )astr; UChar* pattern_end = (UChar* )apattern_end; + EXEC_COUNT++; + EXEC_COUNT_INTERVAL++; + onig_initialize(&enc, 1); onig_set_retry_limit_in_match(RETRY_LIMIT); - 
onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT); + //onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT); r = onig_new(®, pattern, pattern_end, - options, enc, ONIG_SYNTAX_DEFAULT, &einfo); + options, enc, syntax, &einfo); if (r != ONIG_NORMAL) { char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r, &einfo); +#ifdef WITH_READ_MAIN fprintf(stdout, "ERROR: %s\n", s); +#endif onig_end(); - return -1; + + if (r == ONIGERR_PARSER_BUG || + r == ONIGERR_STACK_BUG || + r == ONIGERR_UNDEFINED_BYTECODE || + r == ONIGERR_UNEXPECTED_BYTECODE) { + return -2; + } + else + return -1; } + REGEX_SUCCESS_COUNT++; + + r = search(reg, pattern, pattern_end); + if (r == -2) return -2; if (onigenc_is_valid_mbc_string(enc, str, end) != 0) { + VALID_STRING_COUNT++; r = search(reg, str, end); + if (r == -2) return -2; } onig_free(reg); @@ -85,52 +129,114 @@ exec(OnigEncoding enc, OnigOptionType options, return 0; } -#define PATTERN_SIZE 32 -#define NUM_CONTROL_BYTES 1 -#define MIN_STR_SIZE 1 -int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +#if 0 +static void +output_data(char* path, const uint8_t * data, size_t size) { - if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) - return 0; - if (Size > 0x1000) - return 0; + int fd; + ssize_t n; + fd = open(path, O_CREAT|O_RDWR, S_IRUSR|S_IRGRP|S_IROTH); + if (fd == -1) { + fprintf(stderr, "ERROR: output_data(): can't open(%s)\n", path); + return ; + } + + n = write(fd, (const void* )data, size); + if (n != size) { + fprintf(stderr, "ERROR: output_data(): n: %ld, size: %ld\n", n, size); + } + close(fd); +} +#endif + + +static int +alloc_exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax, + int pattern_size, size_t remaining_size, unsigned char *data) +{ + int r; unsigned char *pattern_end; unsigned char *str_null_end; - size_t remaining_size = Size; - unsigned char *data = (unsigned char *)(Data); + // copy first PATTERN_SIZE bytes off to be the pattern + unsigned char *pattern = (unsigned 
char *)malloc(pattern_size != 0 ? pattern_size : 1); + memcpy(pattern, data, pattern_size); + pattern_end = pattern + pattern_size; + data += pattern_size; + remaining_size -= pattern_size; - // pull off one byte to switch off - unsigned char encoding_choice = data[0]; - data++; - remaining_size--; +#if defined(UTF16_BE) || defined(UTF16_LE) + if (remaining_size % 2 == 1) remaining_size--; +#endif - // copy first PATTERN_SIZE bytes off to be the pattern - unsigned char *pattern = (unsigned char *)malloc(PATTERN_SIZE+4); - memset(pattern, 0, PATTERN_SIZE+4); - memcpy(pattern, data, PATTERN_SIZE); - pattern_end = pattern + PATTERN_SIZE; - data += PATTERN_SIZE; - remaining_size -= PATTERN_SIZE; - - unsigned char *str = (unsigned char*)malloc(remaining_size+4); - memset(str, 0, remaining_size+4); + unsigned char *str = (unsigned char*)malloc(remaining_size != 0 ? remaining_size : 1); memcpy(str, data, remaining_size); str_null_end = str + remaining_size; - int r; - OnigEncodingType *encodings[] = { - ONIG_ENCODING_SJIS, - ONIG_ENCODING_EUC_JP, - ONIG_ENCODING_CP1251, - ONIG_ENCODING_ISO_8859_1, - ONIG_ENCODING_UTF8, - ONIG_ENCODING_KOI8_R, - ONIG_ENCODING_BIG5 + r = exec(enc, options, syntax, + (char *)pattern, (char *)pattern_end, + (char *)str, str_null_end); + + free(pattern); + free(str); + return r; +} + + +#define EXEC_PRINT_INTERVAL 10000000 +#define MAX_PATTERN_SIZE 150 + +#ifdef SYNTAX_TEST +#define NUM_CONTROL_BYTES 3 +#else +#define NUM_CONTROL_BYTES 2 +#endif + +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ +#if !defined(UTF16_BE) && !defined(UTF16_LE) + static OnigEncoding encodings[] = { + ONIG_ENCODING_UTF8, + ONIG_ENCODING_UTF8, + ONIG_ENCODING_UTF8, + ONIG_ENCODING_SJIS, + //ONIG_ENCODING_EUC_JP, + ONIG_ENCODING_ISO_8859_1, + ONIG_ENCODING_BIG5, + ONIG_ENCODING_GB18030, + ONIG_ENCODING_EUC_TW + }; + unsigned char encoding_choice; +#endif + +#ifdef SYNTAX_TEST + static OnigSyntaxType* syntaxes[] = { + ONIG_SYNTAX_POSIX_EXTENDED, + 
ONIG_SYNTAX_EMACS, + ONIG_SYNTAX_GREP, + ONIG_SYNTAX_GNU_REGEX, + ONIG_SYNTAX_JAVA, + ONIG_SYNTAX_PERL_NG, + ONIG_SYNTAX_ONIGURUMA }; + unsigned char syntax_choice; +#endif + + int r; + int pattern_size; + size_t remaining_size; + unsigned char *data; + unsigned char options_choice; + OnigOptionType options; + OnigEncoding enc; + OnigSyntaxType* syntax; - OnigEncodingType *enc; + INPUT_COUNT++; + if (Size < NUM_CONTROL_BYTES) return 0; + + remaining_size = Size; + data = (unsigned char* )(Data); #ifdef UTF16_BE enc = ONIG_ENCODING_UTF16_BE; @@ -138,24 +244,113 @@ int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) #ifdef UTF16_LE enc = ONIG_ENCODING_UTF16_LE; #else + encoding_choice = data[0]; + data++; + remaining_size--; + int num_encodings = sizeof(encodings)/sizeof(encodings[0]); enc = encodings[encoding_choice % num_encodings]; #endif #endif - r = exec(enc, ONIG_OPTION_NONE, (char *)pattern, (char *)pattern_end, - (char *)str, str_null_end); +#ifdef SYNTAX_TEST + syntax_choice = data[0]; + data++; + remaining_size--; - free(pattern); - free(str); + int num_syntaxes = sizeof(syntaxes)/sizeof(syntaxes[0]); + syntax = syntaxes[syntax_choice % num_syntaxes]; +#else + syntax = ONIG_SYNTAX_DEFAULT; +#endif + + options_choice = data[0]; + options = (options_choice % 2 == 0) ? 
ONIG_OPTION_NONE : ONIG_OPTION_IGNORECASE; + data++; + remaining_size--; + +#ifdef WITH_READ_MAIN +#ifdef SYNTAX_TEST + fprintf(stdout, "enc: %s, syntax: %d, options: %u\n", + ONIGENC_NAME(enc), (int )(syntax_choice % num_syntaxes), options); +#else + fprintf(stdout, "enc: %s, options: %u\n", ONIGENC_NAME(enc), options); +#endif +#endif +#ifdef WITH_READ_MAIN + int max_pattern_size; + + if (remaining_size == 0) + max_pattern_size = 0; + else { + max_pattern_size = remaining_size - 1; + if (max_pattern_size > MAX_PATTERN_SIZE) + max_pattern_size = MAX_PATTERN_SIZE; + +#if defined(UTF16_BE) || defined(UTF16_LE) + if (max_pattern_size % 2 == 1) max_pattern_size--; +#endif + } + + for (pattern_size = 0; pattern_size <= max_pattern_size; ) { + fprintf(stdout, "pattern_size: %d\n", pattern_size); + r = alloc_exec(enc, options, syntax, pattern_size, remaining_size, data); + if (r == -2) { + //output_data("parser-bug", Data, Size); + exit(-2); + } + +#if defined(UTF16_BE) || defined(UTF16_LE) + pattern_size += 2; +#else + pattern_size++; +#endif + } + +#else /* WITH_READ_MAIN */ + + if (remaining_size == 0) + pattern_size = 0; + else { + pattern_size = INPUT_COUNT % remaining_size; + if (pattern_size > MAX_PATTERN_SIZE) + pattern_size = MAX_PATTERN_SIZE; + +#if defined(UTF16_BE) || defined(UTF16_LE) + if (pattern_size % 2 == 1) pattern_size--; +#endif + } + + r = alloc_exec(enc, options, syntax, pattern_size, remaining_size, data); + if (r == -2) { + //output_data("parser-bug", Data, Size); + exit(-2); + } +#endif /* else WITH_READ_MAIN */ + + if (EXEC_COUNT_INTERVAL == EXEC_PRINT_INTERVAL) { + char d[64]; + time_t t; + float fexec, freg, fvalid; + + t = time(NULL); + strftime(d, sizeof(d), "%m/%d %H:%M:%S", localtime(&t)); + + fexec = (float )EXEC_COUNT / INPUT_COUNT; + freg = (float )REGEX_SUCCESS_COUNT / INPUT_COUNT; + fvalid = (float )VALID_STRING_COUNT / INPUT_COUNT; + + fprintf(stdout, "%s: %ld: EXEC:%.2f, REG:%.2f, VALID:%.2f\n", + d, EXEC_COUNT, fexec, freg, 
fvalid); + + EXEC_COUNT_INTERVAL = 0; + } return r; } #ifdef WITH_READ_MAIN -#include <unistd.h> - extern int main(int argc, char* argv[]) { size_t n; diff --git a/contributed/libfuzzer-onig.cpp b/harnesses/libfuzzer-onig.cpp index 526c826..526c826 100644 --- a/contributed/libfuzzer-onig.cpp +++ b/harnesses/libfuzzer-onig.cpp diff --git a/harnesses/makefile b/harnesses/makefile new file mode 100644 index 0000000..dfd84de --- /dev/null +++ b/harnesses/makefile @@ -0,0 +1,69 @@ +# makefile for harness +SRC = ../src +CFLAGS = -I$(SRC) -Wall -g -fsanitize=fuzzer,address -fno-omit-frame-pointer +CFLAGS_M = -I$(SRC) -Wall -g -fsanitize=fuzzer-no-link,address -fno-omit-frame-pointer -DWITH_READ_MAIN +ONIG_LIB = $(SRC)/.libs/libonig.a +LIBS = $(ONIG_LIB) + +TARGETS = encode-libfuzzer syntax-libfuzzer \ + utf16-be-libfuzzer utf16-le-libfuzzer main-encode main-syntax \ + main-utf16-be main-utf16-le main-regset regset-libfuzzer + +OTHER_TARGETS = libfuzzer-onig libfuzzer-onig-full \ + deluxe-encode-libfuzzer main-deluxe-encode + + +default: $(TARGETS) + +encode-libfuzzer: encode-harness.c $(ONIG_LIB) + clang $(CFLAGS) $< $(LIBS) -o $@ + +syntax-libfuzzer: encode-harness.c $(ONIG_LIB) + clang -DSYNTAX_TEST $(CFLAGS) $< $(LIBS) -o $@ + +deluxe-encode-libfuzzer: deluxe-encode-harness.c $(ONIG_LIB) + clang $(CFLAGS) $< $(LIBS) -o $@ + +utf16-be-libfuzzer: encode-harness.c $(ONIG_LIB) + clang -DUTF16_BE $(CFLAGS) $< $(LIBS) -o $@ + +utf16-le-libfuzzer: encode-harness.c $(ONIG_LIB) + clang -DUTF16_LE $(CFLAGS) $< $(LIBS) -o $@ + +regset-libfuzzer: regset-harness.c $(ONIG_LIB) + clang $(CFLAGS) $< $(LIBS) -o $@ + +main-encode: encode-harness.c $(ONIG_LIB) + clang $(CFLAGS_M) $< $(LIBS) -o $@ + +main-syntax: encode-harness.c $(ONIG_LIB) + clang -DSYNTAX_TEST $(CFLAGS_M) $< $(LIBS) -o $@ + +main-deluxe-encode: deluxe-encode-harness.c $(ONIG_LIB) + clang $(CFLAGS_M) $< $(LIBS) -o $@ + +main-utf16-be: encode-harness.c $(ONIG_LIB) + clang -DUTF16_BE $(CFLAGS_M) $< $(LIBS) -o $@ + 
+main-utf16-le: encode-harness.c $(ONIG_LIB) + clang -DUTF16_LE $(CFLAGS_M) $< $(LIBS) -o $@ + +main-regset: regset-harness.c $(ONIG_LIB) + clang $(CFLAGS_M) $< $(LIBS) -o $@ + +libfuzzer-onig: libfuzzer-onig.cpp $(ONIG_LIB) + clang++ $(CFLAGS) $< $(LIBS) -o $@ + +libfuzzer-onig-full: libfuzzer-onig.cpp $(ONIG_LIB) + clang++ -DFULL_TEST $(CFLAGS) $< $(LIBS) -o $@ + + +$(ONIG_LIB): + cd ..; make clean + #cd ..; autoreconf -vfi + cd ..; ./configure CC=clang LD=clang CFLAGS="-g -fsanitize=address -fno-omit-frame-pointer" LDFLAGS="-g -fsanitize=address -fno-omit-frame-pointer" + cd ..; make -j4 + + +clean: + rm -f $(TARGETS) $(OTHER_TARGETS) diff --git a/harnesses/regset-harness.c b/harnesses/regset-harness.c new file mode 100644 index 0000000..b4b7e20 --- /dev/null +++ b/harnesses/regset-harness.c @@ -0,0 +1,379 @@ +/* + * regset-harness.c + * Copyright (c) 2019 K.Kosako + */ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <time.h> + +#include "oniguruma.h" + + +#define RETRY_LIMIT 500 + +#ifdef WITH_READ_MAIN +//#define CHECK_EACH_REGEX_SEARCH_TIME +#endif + +#define MAX_REG_NUM 256 + +typedef unsigned char uint8_t; +static OnigEncoding ENC; + +#ifdef CHECK_EACH_REGEX_SEARCH_TIME +static double +get_sec(struct timespec* ts, struct timespec* te) +{ + double t; + + t = (te->tv_sec - ts->tv_sec) + + (double )(te->tv_nsec - ts->tv_nsec) / 1000000000.0; + return t; +} + +static int +check_each_regex_search_time(OnigRegSet* set, unsigned char* str, unsigned char* end) +{ + int n; + int i; + int r; + OnigRegion* region; + + n = onig_regset_number_of_regex(set); + region = onig_region_new(); + + for (i = 0; i < n; i++) { + regex_t* reg; + unsigned char* start; + unsigned char* range; + struct timespec ts1, ts2; + double t; + + reg = onig_regset_get_regex(set, i); + start = str; + range = end; + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1); + + r = 
onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2); + t = get_sec(&ts1, &ts2); + + fprintf(stdout, "regex search time %d: %6.2lfmsec.\n", i, t * 1000.0); + } + + onig_region_free(region, 1); + return 0; +} +#endif + +static int +search(OnigRegSet* set, OnigRegSetLead lead, unsigned char* str, unsigned char* end) +{ + int r; + int match_pos; + unsigned char *start, *range; + + start = str; + range = end; + r = onig_regset_search(set, str, end, start, range, lead, + ONIG_OPTION_NONE, &match_pos); + if (r >= 0) { +#ifdef WITH_READ_MAIN + int i; + int match_index; + OnigRegion* region; + + match_index = r; + fprintf(stdout, "match reg index: %d, pos: %d (%s)\n", + match_index, match_pos, ONIGENC_NAME(ENC)); + region = onig_regset_get_region(set, match_index); + if (region == 0) { + fprintf(stdout, "ERROR: can't get region.\n"); + return -1; + } + + for (i = 0; i < region->num_regs; i++) { + fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } +#endif + } + else if (r == ONIG_MISMATCH) { +#ifdef WITH_READ_MAIN + fprintf(stdout, "search fail (%s)\n", ONIGENC_NAME(ENC)); +#endif + } + else { /* error */ +#ifdef WITH_READ_MAIN + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + + onig_error_code_to_str((UChar* )s, r); + fprintf(stdout, "ERROR: %s\n", s); + fprintf(stdout, " (%s)\n", ONIGENC_NAME(ENC)); +#endif + return -1; + } + + return 0; +} + +static long INPUT_COUNT; +static long EXEC_COUNT; +static long EXEC_COUNT_INTERVAL; +static long REGEX_SUCCESS_COUNT; +static long VALID_STRING_COUNT; + +static int +exec(OnigEncoding enc, int reg_num, int init_reg_num, + UChar* pat[], UChar* pat_end[], + OnigRegSetLead lead, UChar* str, UChar* end) +{ + int r; + int i, j; + OnigRegSet* set; + regex_t* reg; + OnigOptionType options; + OnigErrorInfo einfo; + regex_t* regs[MAX_REG_NUM]; + + EXEC_COUNT++; + EXEC_COUNT_INTERVAL++; + + options = (EXEC_COUNT % 4 == 0) ? 
ONIG_OPTION_IGNORECASE : ONIG_OPTION_NONE; + + onig_initialize(&enc, 1); + onig_set_retry_limit_in_match(RETRY_LIMIT); + + for (i = 0; i < init_reg_num; i++) { + r = onig_new(®s[i], pat[i], pat_end[i], options, ENC, + ONIG_SYNTAX_DEFAULT, &einfo); + if (r != 0) { +#ifdef WITH_READ_MAIN + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: index: %d, %s\n", i, s); +#endif + + for (j = 0; j < i; j++) onig_free(regs[j]); + + onig_end(); + + if (r == ONIGERR_PARSER_BUG || + r == ONIGERR_STACK_BUG || + r == ONIGERR_UNDEFINED_BYTECODE || + r == ONIGERR_UNEXPECTED_BYTECODE) { + return -2; + } + else + return -1; + } + } + + r = onig_regset_new(&set, init_reg_num, regs); + if (r != 0) { + for (i = 0; i < init_reg_num; i++) { + onig_free(regs[i]); + } + onig_end(); + return -1; + } + + for (i = init_reg_num; i < reg_num; i++) { + r = onig_new(®, pat[i], pat_end[i], options, ENC, + ONIG_SYNTAX_DEFAULT, &einfo); + if (r != 0) { +#ifdef WITH_READ_MAIN + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: index: %d, %s\n", i, s); +#endif + onig_regset_free(set); + onig_end(); + + if (r == ONIGERR_PARSER_BUG || + r == ONIGERR_STACK_BUG || + r == ONIGERR_UNDEFINED_BYTECODE || + r == ONIGERR_UNEXPECTED_BYTECODE) { + return -2; + } + else + return -1; + } + + r = onig_regset_add(set, reg); + if (r != 0) { + onig_regset_free(set); + onig_end(); + fprintf(stdout, "ERROR: onig_regset_add(): %d\n", i); + return r; + } + } + + REGEX_SUCCESS_COUNT++; + + if (onigenc_is_valid_mbc_string(enc, str, end) != 0) { + VALID_STRING_COUNT++; + r = search(set, lead, str, end); +#ifdef CHECK_EACH_REGEX_SEARCH_TIME + r = check_each_regex_search_time(set, str, end); +#endif + } + + onig_regset_free(set); + onig_end(); + return 0; +} + +#define MAX_PATTERN_SIZE 30 +#define NUM_CONTROL_BYTES 3 + +#define EXEC_PRINT_INTERVAL 2000000 + +static int MaxRegNum; +static int MaxInitRegNum; 
+ +extern int +LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ + int r, i; + int pattern_size; + unsigned char *str_null_end; + size_t remaining_size; + unsigned char *data; + unsigned int reg_num; + unsigned int init_reg_num; + unsigned char* pat[256]; + unsigned char* pat_end[256]; + int len; + unsigned int lead_num; + OnigRegSetLead lead; + + INPUT_COUNT++; + + if (Size < NUM_CONTROL_BYTES) return 0; + + remaining_size = Size; + data = (unsigned char* )(Data); + + reg_num = data[0]; + data++; + remaining_size--; + + init_reg_num = data[0]; + data++; + remaining_size--; + + lead_num = data[0]; + data++; + remaining_size--; + lead = (lead_num % 2 == 0 ? ONIG_REGSET_POSITION_LEAD : ONIG_REGSET_REGEX_LEAD); + + if (remaining_size < reg_num * 2) { + reg_num = reg_num % 15; // zero is OK. + } + + init_reg_num %= (reg_num + 1); + + if (MaxRegNum < reg_num) + MaxRegNum = reg_num; + + if (MaxInitRegNum < init_reg_num) + MaxInitRegNum = init_reg_num; + + if (reg_num == 0) + pattern_size = 1; + else + pattern_size = remaining_size / (reg_num * 2); + + if (pattern_size > MAX_PATTERN_SIZE) + pattern_size = MAX_PATTERN_SIZE; + + len = pattern_size * reg_num; + if (len == 0) len = 1; + + for (i = 0; i < reg_num; i++) { + pat[i] = (unsigned char* )malloc(pattern_size); + memcpy(pat[i], data, pattern_size); + pat_end[i] = pat[i] + pattern_size; + data += pattern_size; + remaining_size -= pattern_size; + } + + unsigned char *str = (unsigned char*)malloc(remaining_size != 0 ? remaining_size : 1); + memcpy(str, data, remaining_size); + str_null_end = str + remaining_size; + +#ifdef WITH_READ_MAIN + fprintf(stdout, "reg num: %d, pattern size: %d, lead: %s\n", + reg_num, pattern_size, + lead == ONIG_REGSET_POSITION_LEAD ? 
"position" : "regex"); + + if (reg_num != 0) { + unsigned char* p; + i = 0; + p = pat[0]; + while (p < pat_end[0]) { + fprintf(stdout, " 0x%02x", (int )*p++); + i++; + if (i % 8 == 0) fprintf(stdout, "\n"); + } + fprintf(stdout, "\n"); + } +#endif + + ENC = ONIG_ENCODING_UTF8; + + r = exec(ENC, reg_num, init_reg_num, pat, pat_end, lead, str, str_null_end); + + for (i = 0; i < reg_num; i++) { + free(pat[i]); + } + free(str); + + if (r == -2) { + //output_data("parser-bug", Data, Size); + exit(-2); + } + + if (EXEC_COUNT_INTERVAL == EXEC_PRINT_INTERVAL) { + char d[64]; + time_t t; + float fexec, freg, fvalid; + + t = time(NULL); + strftime(d, sizeof(d), "%m/%d %H:%M:%S", localtime(&t)); + + fexec = (float )EXEC_COUNT / INPUT_COUNT; + freg = (float )REGEX_SUCCESS_COUNT / INPUT_COUNT; + fvalid = (float )VALID_STRING_COUNT / INPUT_COUNT; + + fprintf(stdout, "%s: %ld: EXEC:%.2f, REG:%.2f, VALID:%.2f MAX REG:%d-%d\n", + d, EXEC_COUNT, fexec, freg, fvalid, MaxRegNum, MaxInitRegNum); + + EXEC_COUNT_INTERVAL = 0; + } + return r; +} + +#ifdef WITH_READ_MAIN + +extern int main(int argc, char* argv[]) +{ + size_t n; + uint8_t Data[10000]; + + n = read(0, Data, sizeof(Data)); + fprintf(stdout, "n: %ld\n", n); + LLVMFuzzerTestOneInput(Data, n); + + return 0; +} +#endif /* WITH_READ_MAIN */ diff --git a/harnesses/syntax-harness.c b/harnesses/syntax-harness.c deleted file mode 100644 index 0fb3587..0000000 --- a/harnesses/syntax-harness.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * syntax-harness.c - * contributed by Mark Griffin - */ -#include <stdio.h> -#include <string.h> -#include "oniguruma.h" - -#include <stdlib.h> - -#define DEFAULT_LIMIT 120 -typedef unsigned char uint8_t; - -extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr) -{ - int r; - unsigned char *start, *range, *end; - regex_t* reg; - OnigErrorInfo einfo; - OnigRegion *region; - UChar* pattern = (UChar* )apattern; - UChar* str = (UChar* )astr; - - r = onig_new(®, pattern, pattern + strlen((char* 
)pattern), - ONIG_OPTION_DEFAULT, ONIG_ENCODING_ASCII, syntax, &einfo); - if (r != ONIG_NORMAL) { - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str((UChar* )s, r, &einfo); - fprintf(stdout, "ERROR: %s\n", s); - return -1; - } - - region = onig_region_new(); - - end = str + strlen((char* )str); - start = str; - range = end; - r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); - if (r >= 0) { - int i; - - fprintf(stdout, "match at %d\n", r); - for (i = 0; i < region->num_regs; i++) { - fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); - } - } - else if (r == ONIG_MISMATCH) { - fprintf(stdout, "search fail\n"); - } - else { /* error */ - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str((UChar* )s, r); - fprintf(stdout, "ERROR: %s\n", s); - onig_region_free(region, 1 /* 1:free self, 0:free contents only */); - onig_free(reg); - return -1; - } - - onig_region_free(region, 1 /* 1:free self, 0:free contents only */); - onig_free(reg); - return 0; -} - -#define PATTERN_SIZE 64 -#define NUM_CONTROL_BYTES 1 -#define MIN_STR_SIZE 1 -int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) -{ - if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) - return 0; - if (Size > 0x1000) - return 0; - size_t remaining_size = Size; - unsigned char *data = (unsigned char *)(Data); - - // pull off one byte to switch syntax choice - unsigned char syntax_choice = data[0]; - data++; - remaining_size--; - - // copy first PATTERN_SIZE bytes off to be the pattern - unsigned char *pattern = (unsigned char *)malloc(PATTERN_SIZE+1); - memset(pattern, 0, PATTERN_SIZE+1); - memcpy(pattern, data, PATTERN_SIZE); - data += PATTERN_SIZE; - remaining_size -= PATTERN_SIZE; - - unsigned char *str = (unsigned char*)malloc(remaining_size+1); - memset(str, 0, remaining_size+1); - memcpy(str, data, remaining_size); - - OnigEncoding use_encs[] = { ONIG_ENCODING_ASCII }; - onig_initialize(use_encs, 
sizeof(use_encs)/sizeof(use_encs[0])); - - onig_set_retry_limit_in_match(DEFAULT_LIMIT); - onig_set_parse_depth_limit(DEFAULT_LIMIT); - - OnigSyntaxType *syntaxes[] = { - ONIG_SYNTAX_POSIX_EXTENDED, - ONIG_SYNTAX_EMACS, - ONIG_SYNTAX_GREP, - ONIG_SYNTAX_GNU_REGEX, - ONIG_SYNTAX_JAVA, - ONIG_SYNTAX_PERL_NG, - ONIG_SYNTAX_RUBY, - ONIG_SYNTAX_ONIGURUMA, - }; - OnigSyntaxType *syntax = syntaxes[syntax_choice % 8]; - - int r; - r = exec(syntax, (char *)pattern, (char *)str); - // r = exec(ONIG_SYNTAX_JAVA, "\\p{XDigit}\\P{XDigit}[a-c&&b-g]", "bgc"); - - onig_end(); - - free(pattern); - free(str); - - return 0; -} diff --git a/sample/Makefile.am b/sample/Makefile.am index 320afcf..22a4989 100644 --- a/sample/Makefile.am +++ b/sample/Makefile.am @@ -6,7 +6,11 @@ LDADD = $(lib_onig) AM_LDFLAGS = -L$(prefix)/lib AM_CPPFLAGS = -I$(top_srcdir)/src -TESTS = encode listcap names posix simple sql syntax user_property callout echo count bug_fix +if ENABLE_POSIX_API +TESTS = encode listcap names posix simple sql syntax user_property callout echo count bug_fix regset +else +TESTS = encode listcap names simple sql syntax user_property callout echo count bug_fix regset +endif check_PROGRAMS = $(TESTS) @@ -22,6 +26,7 @@ callout_SOURCES = callout.c echo_SOURCES = echo.c count_SOURCES = count.c bug_fix = bug_fix.c +regset_SOURCES = regset.c sampledir = . 
@@ -29,7 +34,9 @@ test: $(TESTS) $(sampledir)/encode $(sampledir)/listcap $(sampledir)/names +if ENABLE_POSIX_API $(sampledir)/posix +endif $(sampledir)/simple $(sampledir)/sql $(sampledir)/syntax @@ -38,3 +45,4 @@ test: $(TESTS) $(sampledir)/echo $(sampledir)/count $(sampledir)/bug_fix + $(sampledir)/regset diff --git a/sample/bug_fix.c b/sample/bug_fix.c index 3f60c5b..f295bfd 100644 --- a/sample/bug_fix.c +++ b/sample/bug_fix.c @@ -81,7 +81,7 @@ extern int main(int argc, char* argv[]) /* fix ignore case in look-behind commit: 3340ec2cc5627172665303fe248c9793354d2251 */ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE, - "(?<=\305\211)a", "\312\274na"); /* \u{0149}a \u{02bc}na */ + "\305\211a", "\312\274na"); /* \u{0149}a \u{02bc}na */ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE, "(\\2)(\\1)", "aa"); /* fail. */ diff --git a/sample/regset.c b/sample/regset.c new file mode 100644 index 0000000..ca3a10c --- /dev/null +++ b/sample/regset.c @@ -0,0 +1,94 @@ +/* + * regset.c + */ +#include <stdio.h> +#include <string.h> +#include "oniguruma.h" + +extern int main(int argc, char* argv[]) +{ + int r; + int i, n; + int match_pos; + unsigned char *start, *range, *end; + OnigRegSet* set; + OnigRegSetLead lead; + regex_t* reg; + OnigErrorInfo einfo; + char ebuf[ONIG_MAX_ERROR_MESSAGE_LEN]; + + static UChar* str = (UChar* )"aaaaaaaaaaaaaaaaaaaaaaca"; + + static char* pat[] = { + "a(.*)b|a(.)c", + "^(abc)", + "a(.....)c" + }; + + OnigEncoding use_encs[] = { ONIG_ENCODING_UTF8 }; + onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); + + r = onig_regset_new(&set, 0, NULL); + if (r != ONIG_NORMAL) { + onig_error_code_to_str((UChar* )ebuf, r); + fprintf(stderr, "ERROR: %s\n", ebuf); + onig_end(); + return -1; + } + + n = sizeof(pat) / sizeof(pat[0]); + + for (i = 0; i < n; i++) { + r = onig_new(®, (UChar* )pat[i], (UChar* )(pat[i] + strlen(pat[i])), + ONIG_OPTION_DEFAULT, ONIG_ENCODING_UTF8, ONIG_SYNTAX_DEFAULT, + &einfo); + if (r != ONIG_NORMAL) { + 
onig_error_code_to_str((UChar* )ebuf, r, &einfo); + fprintf(stderr, "ERROR: %s\n", ebuf); + onig_regset_free(set); + onig_end(); + return -1; + } + + r = onig_regset_add(set, reg); + if (r != ONIG_NORMAL) { + onig_free(reg); + onig_regset_free(set); + onig_end(); + return -1; + } + } + + end = str + strlen((char* )str); + start = str; + range = end; + lead = ONIG_REGSET_POSITION_LEAD; + //lead = ONIG_REGSET_PRIORITY_TO_REGEX_ORDER; + r = onig_regset_search(set, str, end, start, range, lead, ONIG_OPTION_NONE, + &match_pos); + if (r >= 0) { + OnigRegion *region; + + fprintf(stderr, "match regex index: %d\n", r); + fprintf(stderr, "match position: %d\n", match_pos); + + region = onig_regset_get_region(set, r); + for (i = 0; i < region->num_regs; i++) { + fprintf(stderr, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + } + else if (r == ONIG_MISMATCH) { + fprintf(stderr, "search fail\n"); + } + else { /* error */ + onig_error_code_to_str((UChar* )ebuf, r); + fprintf(stderr, "ERROR: %s\n", ebuf); + onig_regset_free(set); + onig_end(); + return -1; + } + + onig_regset_free(set); + onig_end(); + return 0; +} diff --git a/src/Makefile.windows b/src/Makefile.windows index 762cf07..1e87504 100644 --- a/src/Makefile.windows +++ b/src/Makefile.windows @@ -2,6 +2,9 @@ product_name = oniguruma +TEST_DIR = $(ONIG_DIR)/../test +WIN_DIR = $(ONIG_DIR)/../windows + CPPFLAGS = CFLAGS = -O2 -nologo /W3 LDFLAGS = @@ -152,25 +155,24 @@ $(BUILD_DIR)/unicode_fold1_key.obj: $(ONIG_DIR)/unicode_fold1_key.c $(ONIG_DIR)/ $(BUILD_DIR)/unicode_fold2_key.obj: $(ONIG_DIR)/unicode_fold2_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h $(BUILD_DIR)/unicode_fold3_key.obj: $(ONIG_DIR)/unicode_fold3_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h -# C library test -ctest: $(testc) - .\$(testc) -# POSIX C library test -ptest: $(testp) - .\$(testp) +test_regset: $(TEST_DIR)/test_regset.c $(libname) + $(CC) -nologo /Fe:$@ /I. 
/I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_regset.c $(libname) + +test_utf8: $(TEST_DIR)/test_utf8.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_utf8.c $(libname) -$(testc): $(testc).c $(libname) - $(CC) -nologo /Fe:$(testc) -DONIG_EXTERN=extern $(testc).c $(libname) +testc: $(WIN_DIR)/testc.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(WIN_DIR)/testc.c $(libname) -$(testp): $(testc).c $(dlllib) - $(CC) -nologo -DPOSIX_TEST /Fe:$(testp) $(testc).c $(dlllib) +testp: $(WIN_DIR)/testc.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /DPOSIX_TEST $(WIN_DIR)/testc.c $(libname) -$(testc)u: $(testc)u.c $(libname) - $(CC) -nologo /Fe:$(testc)u -DONIG_EXTERN=extern $(testc)u.c $(libname) +testu: $(TEST_DIR)/testu.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(TEST_DIR)/testu.c $(libname) clean: - del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\$(testp).exe $(BUILD_DIR)\$(testc).exe $(BUILD_DIR)\$(testc).obj + del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe samples: all diff --git a/src/ascii.c b/src/ascii.c index e83e4d6..f2dc0d3 100644 --- a/src/ascii.c +++ b/src/ascii.c @@ -2,7 +2,7 @@ ascii.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -2,7 +2,7 @@ big5.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -55,6 +55,16 @@ big5_mbc_enc_len(const UChar* p) } static int +big5_code_to_mbclen(OnigCodePoint code) +{ + if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + if ((code & 0xff00) != 0) return 2; + if (EncLen_BIG5[(int )(code & 0xff)] == 1) return 1; + + return ONIGERR_INVALID_CODE_POINT_VALUE; +} + +static int is_valid_mbc_string(const UChar* p, const UChar* end) { while (p < end) { @@ -99,15 +109,6 @@ big5_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } -#if 0 -static int -big5_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_BIG5, flag, pp, end); -} -#endif - static int big5_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -174,7 +175,7 @@ OnigEncodingType OnigEncodingBIG5 = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, big5_mbc_to_code, - onigenc_mb2_code_to_mbclen, + big5_code_to_mbclen, big5_code_to_mbc, big5_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/config.h.win32 b/src/config.h.win32 index 1f848e2..82a35b9 100644 --- a/src/config.h.win32 +++ b/src/config.h.win32 @@ -1,3 +1,9 @@ +#if defined(__MINGW32__) || _MSC_VER >= 1600 +#define HAVE_STDINT_H 1 +#endif +#if defined(__MINGW32__) || _MSC_VER >= 1800 +#define HAVE_INTTYPES_H 1 +#endif #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 #define HAVE_MEMORY_H 1 diff --git a/src/config.h.win64 b/src/config.h.win64 index f72671b..7f19699 100644 --- a/src/config.h.win64 +++ b/src/config.h.win64 @@ -1,3 +1,9 @@ +#if 
defined(__MINGW32__) || _MSC_VER >= 1600 +#define HAVE_STDINT_H 1 +#endif +#if defined(__MINGW32__) || _MSC_VER >= 1800 +#define HAVE_INTTYPES_H 1 +#endif #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 #define HAVE_MEMORY_H 1 diff --git a/src/config.h.windows.in b/src/config.h.windows.in index d8de1dd..d4f73d7 100644 --- a/src/config.h.windows.in +++ b/src/config.h.windows.in @@ -1,7 +1,14 @@ +#if defined(__MINGW32__) || _MSC_VER >= 1600 +#define HAVE_STDINT_H 1 +#endif +#if defined(__MINGW32__) || _MSC_VER >= 1800 +#define HAVE_INTTYPES_H 1 +#endif #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 #define HAVE_MEMORY_H 1 #define HAVE_OFF_T 1 + #define SIZEOF_INT 4 #define SIZEOF_LONG 4 #define SIZEOF_LONG_LONG 8 diff --git a/src/cp1251.c b/src/cp1251.c index b4ce4d8..fa20780 100644 --- a/src/cp1251.c +++ b/src/cp1251.c @@ -2,8 +2,8 @@ cp1251.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2006-2018 Byte <byte AT mail DOT kna DOT ru> - * K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2006-2019 Byte <byte AT mail DOT kna DOT ru> + * K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/euc_jp.c b/src/euc_jp.c index d17386d..640b3e3 100644 --- a/src/euc_jp.c +++ b/src/euc_jp.c @@ -2,7 +2,7 @@ euc_jp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -120,25 +120,6 @@ code_to_mbclen(OnigCodePoint code) return ONIGERR_INVALID_CODE_POINT_VALUE; } -#if 0 -static int -code_to_mbc_first(OnigCodePoint code) -{ - int first; - - if ((code & 0xff0000) != 0) { - first = (code >> 16) & 0xff; - } - else if ((code & 0xff00) != 0) { - first = (code >> 8) & 0xff; - } - else { - return (int )code; - } - return first; -} -#endif - static int code_to_mbc(OnigCodePoint code, UChar *buf) { diff --git a/src/euc_jp_prop.c b/src/euc_jp_prop.c index be719cf..a816f48 100644 --- a/src/euc_jp_prop.c +++ b/src/euc_jp_prop.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -pt -T -L ANSI-C -N onigenc_euc_jp_lookup_property_name --output-file gperf1.tmp euc_jp_prop.gperf */ +/* Command-line: gperf -pt -T -L ANSI-C -N onigenc_euc_jp_lookup_property_name --output-file gperf1.tmp euc_jp_prop.gperf */ /* Computed positions: -k'1,3' */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ diff --git a/src/euc_kr.c b/src/euc_kr.c index bb968b0..7fa50af 100644 --- a/src/euc_kr.c +++ b/src/euc_kr.c @@ -2,7 +2,7 @@ euc_kr.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -55,6 +55,16 @@ euckr_mbc_enc_len(const UChar* p) } static int +euckr_code_to_mbclen(OnigCodePoint code) +{ + if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + if ((code & 0xff00) != 0) return 2; + if (EncLen_EUCKR[(int )(code & 0xff)] == 1) return 1; + + return ONIGERR_INVALID_CODE_POINT_VALUE; +} + +static int is_valid_mbc_string(const UChar* p, const UChar* end) { while (p < end) { @@ -98,15 +108,6 @@ euckr_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } -#if 0 -static int -euckr_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_KR, flag, pp, end); -} -#endif - static int euckr_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -149,7 +150,7 @@ OnigEncodingType OnigEncodingEUC_KR = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, euckr_mbc_to_code, - onigenc_mb2_code_to_mbclen, + euckr_code_to_mbclen, euckr_code_to_mbc, euckr_mbc_case_fold, onigenc_ascii_apply_all_case_fold, @@ -174,7 +175,7 @@ OnigEncodingType OnigEncodingEUC_CN = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, euckr_mbc_to_code, - onigenc_mb2_code_to_mbclen, + euckr_code_to_mbclen, euckr_code_to_mbc, euckr_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/euc_tw.c b/src/euc_tw.c index c9acaf1..8e72b97 100644 --- a/src/euc_tw.c +++ b/src/euc_tw.c @@ -2,7 +2,7 @@ euc_tw.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -55,6 +55,20 @@ euctw_mbc_enc_len(const UChar* p) } static int +euctw_code_to_mbclen(OnigCodePoint code) +{ + if ((code & 0xff000000) != 0) return 4; + else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + else if ((code & 0xff00) != 0) return 2; + else { + if (EncLen_EUCTW[(int )(code & 0xff)] == 1) + return 1; + + return ONIGERR_INVALID_CODE_POINT_VALUE; + } +} + +static int is_valid_mbc_string(const UChar* p, const UChar* end) { while (p < end) { @@ -155,7 +169,7 @@ OnigEncodingType OnigEncodingEUC_TW = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, euctw_mbc_to_code, - onigenc_mb4_code_to_mbclen, + euctw_code_to_mbclen, euctw_code_to_mbc, euctw_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/gb18030.c b/src/gb18030.c index 8d415b0..50898eb 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -3,7 +3,7 @@ **********************************************************************/ /*- * Copyright (c) 2005-2019 KUBO Takehiro <kubo AT jiubao DOT org> - * K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -33,6 +33,7 @@ #if 1 #define DEBUG_GB18030(arg) #else +#include <stdio.h> #define DEBUG_GB18030(arg) printf arg #endif @@ -76,6 +77,20 @@ gb18030_mbc_enc_len(const UChar* p) } static int +gb18030_code_to_mbclen(OnigCodePoint code) +{ + if ((code & 0xff000000) != 0) return 4; + else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + else if ((code & 0xff00) != 0) return 2; + else { + if (GB18030_MAP[(int )(code & 0xff)] == CM) + return ONIGERR_INVALID_CODE_POINT_VALUE; + + return 1; + } +} + +static int is_valid_mbc_string(const UChar* p, const UChar* end) { while (p < end) { @@ -135,15 +150,6 @@ gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } -#if 0 -static int -gb18030_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end); -} -#endif - static int gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -522,7 +528,7 @@ OnigEncodingType OnigEncodingGB18030 = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, gb18030_mbc_to_code, - onigenc_mb4_code_to_mbclen, + gb18030_code_to_mbclen, gb18030_code_to_mbc, gb18030_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/gperf_fold_key_conv.py b/src/gperf_fold_key_conv.py index f453186..c633100 100755 --- a/src/gperf_fold_key_conv.py +++ b/src/gperf_fold_key_conv.py @@ -12,7 +12,7 @@ REG_STR_AT = re.compile('str\[(\d+)\]') REG_RETURN_TYPE = re.compile('^const\s+short\s+int\s*\*') REG_FOLD_KEY = re.compile('unicode_fold(\d)_key\s*\(register\s+const\s+char\s*\*\s*str,\s*register\s+size_t\s+len\)') REG_ENTRY = re.compile('\{".*?",\s*(-?\d+)\s*\}') -REG_IF_LEN = re.compile('if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') +REG_IF_LEN = re.compile('\s*if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') REG_GET_HASH = 
re.compile('(?:register\s+)?(?:unsigned\s+)?int\s+key\s*=\s*hash\s*\(str,\s*len\);') REG_GET_CODE = re.compile('(?:register\s+)?const\s+char\s*\*\s*s\s*=\s*wordlist\[key\]\.name;') REG_CODE_CHECK = re.compile('if\s*\(\*str\s*==\s*\*s\s*&&\s*!strncmp.+\)') @@ -34,7 +34,7 @@ def parse_line(s, key_len): if r != s: return r r = re.sub(REG_ENTRY, '\\1', s) if r != s: return r - r = re.sub(REG_IF_LEN, 'if (0 == 0)', s) + r = re.sub(REG_IF_LEN, '', s) if r != s: return r r = re.sub(REG_GET_HASH, 'int key = hash(codes);', s) if r != s: return r diff --git a/src/gperf_unfold_key_conv.py b/src/gperf_unfold_key_conv.py index 3cf4836..d999d4e 100755 --- a/src/gperf_unfold_key_conv.py +++ b/src/gperf_unfold_key_conv.py @@ -12,7 +12,7 @@ REG_STR_AT = re.compile('str\[(\d+)\]') REG_UNFOLD_KEY = re.compile('onigenc_unicode_unfold_key\s*\(register\s+const\s+char\s*\*\s*str,\s*register\s+size_t\s+len\)') REG_ENTRY = re.compile('\{".+?",\s*/\*(.+?)\*/\s*(-?\d+),\s*(\d)\}') REG_EMPTY_ENTRY = re.compile('\{"",\s*(-?\d+),\s*(\d)\}') -REG_IF_LEN = re.compile('if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') +REG_IF_LEN = re.compile('\s*if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') REG_GET_HASH = re.compile('(?:register\s+)?(?:unsigned\s+)?int\s+key\s*=\s*hash\s*\(str,\s*len\);') REG_GET_CODE = re.compile('(?:register\s+)?const\s+char\s*\*\s*s\s*=\s*wordlist\[key\]\.name;') REG_CODE_CHECK = re.compile('if\s*\(\*str\s*==\s*\*s\s*&&\s*!strncmp.+\)') @@ -32,7 +32,7 @@ def parse_line(s): if r != s: return r r = re.sub(REG_EMPTY_ENTRY, '{0xffffffff, \\1, \\2}', s) if r != s: return r - r = re.sub(REG_IF_LEN, 'if (0 == 0)', s) + r = re.sub(REG_IF_LEN, '', s) if r != s: return r r = re.sub(REG_GET_HASH, 'int key = hash(&code);', s) if r != s: return r diff --git a/src/iso8859_1.c b/src/iso8859_1.c index 3b64942..e681c2a 100644 --- a/src/iso8859_1.c +++ b/src/iso8859_1.c @@ -2,7 +2,7 @@ iso8859_1.c - Oniguruma (regular expression library) **********************************************************************/ 
/*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -216,32 +216,6 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_1_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (*p >= 0xaa && *p <= 0xba) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_10.c b/src/iso8859_10.c index f5882bc..e98cffb 100644 --- a/src/iso8859_10.c +++ b/src/iso8859_10.c @@ -2,7 +2,7 @@ iso8859_10.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,28 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_10_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_11.c b/src/iso8859_11.c index da8fda0..8639ce2 100644 --- a/src/iso8859_11.c +++ b/src/iso8859_11.c @@ -2,7 +2,7 @@ iso8859_11.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/iso8859_13.c b/src/iso8859_13.c index 0cf251c..2bd460f 100644 --- a/src/iso8859_13.c +++ b/src/iso8859_13.c @@ -2,7 +2,7 @@ iso8859_13.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_13_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf, 0xb5 are lower case letter, but can't convert. */ - if (*p == 0xb5) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_14.c b/src/iso8859_14.c index 030e9f5..5030b55 100644 --- a/src/iso8859_14.c +++ b/src/iso8859_14.c @@ -2,7 +2,7 @@ iso8859_14.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,29 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_14_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_15.c b/src/iso8859_15.c index 859d727..f32c3de 100644 --- a/src/iso8859_15.c +++ b/src/iso8859_15.c @@ -2,7 +2,7 @@ iso8859_15.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_15_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf etc.. are lower case letter, but can't convert. */ - if (*p == 0xaa || *p == 0xb5 || *p == 0xba) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_16.c b/src/iso8859_16.c index 2614e56..22a653a 100644 --- a/src/iso8859_16.c +++ b/src/iso8859_16.c @@ -2,7 +2,7 @@ iso8859_16.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,28 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_16_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_2.c b/src/iso8859_2.c index ba030d5..dc3d0a1 100644 --- a/src/iso8859_2.c +++ b/src/iso8859_2.c @@ -2,7 +2,7 @@ iso8859_2.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,28 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_2_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static const OnigPairCaseFoldCodes CaseFoldMap[] = { { 0xa1, 0xb1 }, { 0xa3, 0xb3 }, diff --git a/src/iso8859_3.c b/src/iso8859_3.c index f090d0b..49dc6b2 100644 --- a/src/iso8859_3.c +++ b/src/iso8859_3.c @@ -2,7 +2,7 @@ iso8859_3.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_3_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (*p == 0xb5) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_4.c b/src/iso8859_4.c index 57dc9fe..f3f6ba9 100644 --- a/src/iso8859_4.c +++ b/src/iso8859_4.c @@ -2,7 +2,7 @@ iso8859_4.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,31 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_4_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - if (*p == 0xa2) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_5.c b/src/iso8859_5.c index a090d25..a5f587c 100644 --- a/src/iso8859_5.c +++ b/src/iso8859_5.c @@ -2,7 +2,7 @@ iso8859_5.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -114,19 +114,6 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - (*pp)++; - v = (EncISO_8859_5_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_6.c b/src/iso8859_6.c index 1c16c79..fb72442 100644 --- a/src/iso8859_6.c +++ b/src/iso8859_6.c @@ -2,7 +2,7 @@ iso8859_6.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/iso8859_7.c b/src/iso8859_7.c index 8c88351..018efac 100644 --- a/src/iso8859_7.c +++ b/src/iso8859_7.c @@ -2,7 +2,7 @@ iso8859_7.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -114,26 +114,6 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - (*pp)++; - v = (EncISO_8859_7_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - if (*p == 0xc0 || *p == 0xe0) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_8.c b/src/iso8859_8.c index bd3e94d..92a5eb1 100644 --- a/src/iso8859_8.c +++ b/src/iso8859_8.c @@ -2,7 +2,7 @@ iso8859_8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/iso8859_9.c b/src/iso8859_9.c index 1d291d5..1f9bdea 100644 --- a/src/iso8859_9.c +++ b/src/iso8859_9.c @@ -2,7 +2,7 @@ iso8859_9.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_9_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf etc.. 
are lower case letter, but can't convert. */ - if (*p >= 0xaa && *p <= 0xba) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -2,7 +2,7 @@ koi8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -115,25 +115,6 @@ koi8_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -koi8_is_mbc_ambiguous(OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end) -{ - const OnigUChar* p = *pp; - - (*pp)++; - if (((flag & ONIGENC_CASE_FOLD_ASCII_CASE) != 0 && - ONIGENC_IS_MBC_ASCII(p)) || - ((flag & ONIGENC_CASE_FOLD_NONASCII_CASE) != 0 && - !ONIGENC_IS_MBC_ASCII(p))) { - int v = (EncKOI8_CtypeTable[*p] & - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - return (v != 0 ? TRUE : FALSE); - } - return FALSE; -} -#endif - static int koi8_is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/koi8_r.c b/src/koi8_r.c index 1284f7f..c77302f 100644 --- a/src/koi8_r.c +++ b/src/koi8_r.c @@ -2,7 +2,7 @@ koi8_r.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -114,19 +114,6 @@ koi8_r_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -koi8_r_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - (*pp)++; - v = (EncKOI8_R_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int koi8_r_is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/make_property.sh b/src/make_property.sh index bc5cf98..cef0a96 100755 --- a/src/make_property.sh +++ b/src/make_property.sh @@ -1,8 +1,9 @@ #!/bin/sh +GPERF=gperf + TMP1=gperf1.tmp TMP2=gperf2.tmp -GPERF=/usr/local/bin/gperf GPERF_OPT='-pt -T -L ANSI-C' diff --git a/src/make_unicode_egcb_data.py b/src/make_unicode_egcb_data.py index 0f63f97..9c71796 100755 --- a/src/make_unicode_egcb_data.py +++ b/src/make_unicode_egcb_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_egcb_data.py -# Copyright (c) 2017-2018 K.Kosako +# Copyright (c) 2017-2019 K.Kosako import sys import re @@ -13,18 +13,19 @@ PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)") PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") -VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt") +VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") -VERSION_INFO = None +VERSION_INFO = [-1, -1, -1] DIC = { } PROPS = [] PropIndex = { } def check_version_info(s): - global VERSION_INFO m = VERSION_REG.match(s) if m is not None: - VERSION_INFO = m.group(1) + VERSION_INFO[0] = int(m.group(1)) + VERSION_INFO[1] = int(m.group(2)) + VERSION_INFO[2] = int(m.group(3)) def print_ranges(ranges): for (start, end) in ranges: @@ -160,7 +161,7 @@ def parse_properties(path): continue if s[0] == '#': - if VERSION_INFO is None: + if VERSION_INFO[0] < 0: check_version_info(s) m = PR_LINE_REG.match(s) @@ -194,7 +195,7 @@ PROPS = sorted(PROPS) print '/* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */' COPYRIGHT = ''' /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -222,9 +223,11 @@ COPYRIGHT = ''' print COPYRIGHT print '' -if VERSION_INFO is not None: - print "#define GRAPHEME_BREAK_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' +if VERSION_INFO[0] < 0: + raise RuntimeError("Version is not found") + +print "#define GRAPHEME_BREAK_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) +print '' ranges = [] for prop in PROPS: diff --git a/src/make_unicode_fold.sh b/src/make_unicode_fold.sh index 35ce974..1d5cc1e 100755 --- a/src/make_unicode_fold.sh +++ b/src/make_unicode_fold.sh @@ -1,6 +1,6 @@ #!/bin/sh -GPERF=/usr/local/bin/gperf +GPERF=gperf TMP0=gperf0.tmp TMP1=gperf1.tmp diff --git a/src/make_unicode_fold_data.py b/src/make_unicode_fold_data.py index 783988c..55d5b88 100755 --- a/src/make_unicode_fold_data.py +++ b/src/make_unicode_fold_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_fold_data.py -# Copyright (c) 2016-2018 K.Kosako +# Copyright (c) 2016-2019 K.Kosako import sys import re @@ -16,9 +16,9 @@ DataName = 'OnigUnicodeFolds' ENCODING = 'utf-8' LINE_REG = re.compile("([0-9A-F]{1,6}); (.); ([0-9A-F]{1,6})(?: ([0-9A-F]{1,6}))?(?: ([0-9A-F]{1,6}))?;(?:\s*#\s*)(.*)") -VERSION_REG = re.compile("#.*-(\d+\.\d+\.\d+)\.txt") +VERSION_REG = re.compile("#.*-(\d+)\.(\d+)\.(\d+)\.txt") -VERSION_INFO = None +VERSION_INFO = [-1, -1, -1] FOLDS = {} TURKISH_FOLDS = {} @@ -56,18 +56,19 @@ def form3bytes(x): return "\\x%02x\\x%02x\\x%02x" % (x2, x1, x0) def check_version_info(s): - global VERSION_INFO - if VERSION_INFO is None: - m = VERSION_REG.match(s) - if m is not None: - VERSION_INFO = m.group(1) + m = VERSION_REG.match(s) + if m is not None: + VERSION_INFO[0] = int(m.group(1)) + VERSION_INFO[1] = int(m.group(2)) + VERSION_INFO[2] = int(m.group(3)) def parse_line(s): if len(s) == 0: - return False + return False if s[0] == '#': + if VERSION_INFO[0] < 0: 
check_version_info(s) - return False + return False m = LINE_REG.match(s) if m is None: @@ -232,9 +233,11 @@ def output_fold_source(f, out_comment): print >> f, "/* This file was generated by make_unicode_fold_data.py. */" print >> f, '#include "regenc.h"' print >> f, '' - if VERSION_INFO is not None: - print "#define UNICODE_CASEFOLD_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' + if VERSION_INFO[0] < 0: + raise RuntimeError("Version is not found") + + print "#define UNICODE_CASEFOLD_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) + print '' #output_macros(f, DataName) print >> f, '' #output_typedef(f) @@ -246,7 +249,7 @@ HEAD = ''' /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/make_unicode_property.sh b/src/make_unicode_property.sh index 124d76a..51c8951 100755 --- a/src/make_unicode_property.sh +++ b/src/make_unicode_property.sh @@ -1,10 +1,11 @@ #!/bin/sh +GPERF=gperf + NAME=unicode_property_data TMP1=gperf1.tmp TMP2=gperf2.tmp TMP= -GPERF=/usr/local/bin/gperf GPERF_OPT='-T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool' POOL_CAST='s/\(int *\)\(size_t *\)&\(\(struct +unicode_prop_name_pool_t *\* *\) *0\)->unicode_prop_name_pool_str([^,]+)/pool_offset(\1)/g' diff --git a/src/make_unicode_property_data.py b/src/make_unicode_property_data.py index dc3071a..9776628 100755 --- a/src/make_unicode_property_data.py +++ b/src/make_unicode_property_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_property_data.py -# Copyright (c) 2016-2018 K.Kosako +# Copyright (c) 2016-2019 K.Kosako import sys import re @@ -22,9 +22,12 @@ PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") PA_LINE_REG = 
re.compile("(\w+)\s*;\s*(\w+)") PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") -VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt") +UNICODE_VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") +EMOJI_VERSION_REG = re.compile("(?i)#\s*Version:\s*(\d+)\.(\d+)") + +VERSION_INFO = [-1, -1, -1] +EMOJI_VERSION_INFO = [-1, -1] -VERSION_INFO = None DIC = { } KDIC = { } PropIndex = { } @@ -40,13 +43,6 @@ def fix_block_name(name): s = re.sub(r'[- ]+', '_', name) return 'In_' + s -def check_version_info(s): - global VERSION_INFO - m = VERSION_REG.match(s) - if m is not None: - VERSION_INFO = m.group(1) - - def print_ranges(ranges): for (start, end) in ranges: print "0x%06x, 0x%06x" % (start, end) @@ -233,7 +229,8 @@ def parse_unicode_data_file(f): normalize_ranges_in_dic(dic) return dic, assigned -def parse_properties(path, klass, prop_prefix = None): +def parse_properties(path, klass, prop_prefix = None, version_reg = None): + version_match = None with open(path, 'r') as f: dic = { } prop = None @@ -243,9 +240,10 @@ def parse_properties(path, klass, prop_prefix = None): if len(s) == 0: continue - if s[0] == '#': - if VERSION_INFO is None: - check_version_info(s) + if s[0] == '#' and version_reg is not None and version_match is None: + version_match = version_reg.match(s) + if version_match is not None: + continue m = PR_LINE_REG.match(s) if m: @@ -266,7 +264,7 @@ def parse_properties(path, klass, prop_prefix = None): props.append(prop) normalize_ranges_in_dic(dic) - return (dic, props) + return (dic, props, version_match) def parse_property_aliases(path): a = { } @@ -414,11 +412,11 @@ def entry_and_print_prop_and_index(name, index): nname = normalize_prop_name(name) print_prop_and_index(nname, index) -def parse_and_merge_properties(path, klass): - dic, props = parse_properties(path, klass) +def parse_and_merge_properties(path, klass, prop_prefix = None, 
version_reg = None): + dic, props, ver_m = parse_properties(path, klass, prop_prefix, version_reg) merge_dic(DIC, dic) merge_props(PROPS, props) - return dic, props + return dic, props, ver_m ### main ### argv = sys.argv @@ -447,11 +445,21 @@ with open('UnicodeData.txt', 'r') as f: PROPS = DIC.keys() PROPS = list_sub(PROPS, POSIX_LIST) -parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property') -dic, props = parse_and_merge_properties('Scripts.txt', 'Script') +_, _, ver_m = parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property', None, UNICODE_VERSION_REG) +if ver_m is not None: + VERSION_INFO[0] = int(ver_m.group(1)) + VERSION_INFO[1] = int(ver_m.group(2)) + VERSION_INFO[2] = int(ver_m.group(3)) + +dic, props, _ = parse_and_merge_properties('Scripts.txt', 'Script') DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic)) + parse_and_merge_properties('PropList.txt', 'Binary Property') -parse_and_merge_properties('emoji-data.txt', 'Emoji Property') + +_, _, ver_m = parse_and_merge_properties('emoji-data.txt', 'Emoji Property', None, EMOJI_VERSION_REG) +if ver_m is not None: + EMOJI_VERSION_INFO[0] = int(ver_m.group(1)) + EMOJI_VERSION_INFO[1] = int(ver_m.group(2)) PROPS.append('Unknown') KDIC['Unknown'] = 'Script' @@ -464,9 +472,9 @@ dic, BLOCKS = parse_blocks('Blocks.txt') merge_dic(DIC, dic) if INCLUDE_GRAPHEME_CLUSTER_DATA: - dic, props = parse_properties('GraphemeBreakProperty.txt', - 'GraphemeBreak Property', - GRAPHEME_CLUSTER_BREAK_NAME_PREFIX) + dic, props, _ = parse_properties('GraphemeBreakProperty.txt', + 'GraphemeBreak Property', + GRAPHEME_CLUSTER_BREAK_NAME_PREFIX) merge_dic(DIC, dic) merge_props(PROPS, props) #prop = GRAPHEME_CLUSTER_BREAK_NAME_PREFIX + 'Other' @@ -533,9 +541,13 @@ sys.stdout.write(s) if OUTPUT_LIST_MODE: UPF = open("UNICODE_PROPERTIES", "w") - if VERSION_INFO is not None: - print >> UPF, "Unicode Properties (from Unicode Version: %s)" % VERSION_INFO - print >> UPF, '' + if VERSION_INFO[0] < 0: + 
raise RuntimeError("Unicode Version is not found") + if EMOJI_VERSION_INFO[0] < 0: + raise RuntimeError("Emoji Version is not found") + + print >> UPF, "Unicode Properties (Unicode Version: %d.%d.%d, Emoji: %d.%d)" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2], EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]) + print >> UPF, '' index = -1 for prop in POSIX_LIST: @@ -569,9 +581,14 @@ if not(POSIX_ONLY): print '%%' print '' if not(POSIX_ONLY): - if VERSION_INFO is not None: - print "#define UNICODE_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' + if VERSION_INFO[0] < 0: + raise RuntimeError("Unicode Version is not found") + if EMOJI_VERSION_INFO[0] < 0: + raise RuntimeError("Emoji Version is not found") + + print "#define UNICODE_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) + print "#define UNICODE_EMOJI_VERSION %02d%02d" % (EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]) + print '' print "#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10) print "#define CODE_RANGES_NUM %d" % (index + 1) diff --git a/src/make_unicode_wb_data.py b/src/make_unicode_wb_data.py index 624fa7e..ddedd5d 100755 --- a/src/make_unicode_wb_data.py +++ b/src/make_unicode_wb_data.py @@ -13,18 +13,19 @@ PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)") PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") -VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt") +VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") -VERSION_INFO = None +VERSION_INFO = [-1, -1, -1] DIC = { } PROPS = [] PropIndex = { } def check_version_info(s): - global VERSION_INFO m = VERSION_REG.match(s) if m is not None: - VERSION_INFO = m.group(1) + VERSION_INFO[0] = int(m.group(1)) + VERSION_INFO[1] = int(m.group(2)) + VERSION_INFO[2] = int(m.group(3)) def 
print_ranges(ranges): for (start, end) in ranges: @@ -160,7 +161,7 @@ def parse_properties(path): continue if s[0] == '#': - if VERSION_INFO is None: + if VERSION_INFO[0] < 0: check_version_info(s) m = PR_LINE_REG.match(s) @@ -194,7 +195,7 @@ PROPS = sorted(PROPS) print '/* unicode_wb_data.c: Generated by make_unicode_wb_data.py. */' COPYRIGHT = ''' /*- - * Copyright (c) 2019 K.Kosako <kkosako0 AT gmail DOT com> + * Copyright (c) 2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -222,9 +223,11 @@ COPYRIGHT = ''' print COPYRIGHT print '' -if VERSION_INFO is not None: - print "#define WORD_BREAK_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' +if VERSION_INFO[0] < 0: + raise RuntimeError("Version is not found.") + +print "#define WORD_BREAK_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) +print '' ranges = [] for prop in PROPS: diff --git a/src/mktable.c b/src/mktable.c index 80ac08a..318bac0 100644 --- a/src/mktable.c +++ b/src/mktable.c @@ -2,7 +2,7 @@ mktable.c **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/onig_init.c b/src/onig_init.c index 7ad98b7..c660e7d 100644 --- a/src/onig_init.c +++ b/src/onig_init.c @@ -2,7 +2,7 @@ onig_init.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2016-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2016-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/oniggnu.h b/src/oniggnu.h index d688883..96d9085 100644 --- a/src/oniggnu.h +++ b/src/oniggnu.h @@ -4,7 +4,7 @@ oniggnu.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/onigposix.h b/src/onigposix.h index da0f919..5ff779f 100644 --- a/src/onigposix.h +++ b/src/onigposix.h @@ -4,7 +4,7 @@ onigposix.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -95,6 +95,7 @@ typedef struct { #endif #endif +#ifndef ONIG_STATIC #ifndef ONIG_EXTERN #if defined(_WIN32) && !defined(__GNUC__) #if defined(ONIGURUMA_EXPORT) @@ -108,6 +109,9 @@ typedef struct { #ifndef ONIG_EXTERN #define ONIG_EXTERN extern #endif +#else +#define ONIG_EXTERN extern +#endif #ifndef ONIGURUMA_H typedef unsigned int OnigOptionType; diff --git a/src/oniguruma.h b/src/oniguruma.h index 90cf2d9..08ac6f7 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -4,7 +4,7 @@ oniguruma.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -36,9 +36,9 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 6 #define ONIGURUMA_VERSION_MINOR 9 -#define ONIGURUMA_VERSION_TEENY 3 +#define ONIGURUMA_VERSION_TEENY 4 -#define ONIGURUMA_VERSION_INT 60903 +#define ONIGURUMA_VERSION_INT 60904 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -687,6 +687,14 @@ typedef OnigRegexType* OnigRegex; typedef OnigRegexType regex_t; #endif +struct OnigRegSetStruct; +typedef struct OnigRegSetStruct OnigRegSet; + +typedef enum { + ONIG_REGSET_POSITION_LEAD = 0, + ONIG_REGSET_REGEX_LEAD = 1, + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER = 2 +} OnigRegSetLead; typedef struct { int num_of_elements; @@ -797,6 +805,26 @@ ONIG_EXTERN int onig_match P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN int onig_match_with_param P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp)); + +ONIG_EXTERN +int onig_regset_new P_((OnigRegSet** rset, int n, regex_t* regs[])); +ONIG_EXTERN +int onig_regset_add P_((OnigRegSet* set, regex_t* reg)); +ONIG_EXTERN +int onig_regset_replace P_((OnigRegSet* set, int at, regex_t* reg)); +ONIG_EXTERN +void onig_regset_free P_((OnigRegSet* set)); +ONIG_EXTERN +int onig_regset_number_of_regex P_((OnigRegSet* set)); +ONIG_EXTERN +regex_t* onig_regset_get_regex P_((OnigRegSet* set, int at)); +ONIG_EXTERN +OnigRegion* onig_regset_get_region P_((OnigRegSet* set, int at)); +ONIG_EXTERN +int onig_regset_search P_((OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, int* rmatch_pos)); +ONIG_EXTERN +int onig_regset_search_with_param P_((OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead 
lead, OnigOptionType option, OnigMatchParam* mps[], int* rmatch_pos)); + ONIG_EXTERN OnigRegion* onig_region_new P_((void)); ONIG_EXTERN diff --git a/src/regcomp.c b/src/regcomp.c index b96c793..69d4b95 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -2,7 +2,7 @@ regcomp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -224,17 +224,17 @@ ops_free(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: if (! is_in_string_pool(reg, op->exact_len_n.s)) xfree(op->exact_len_n.s); break; - case OP_EXACTN: case OP_EXACTMB2N: case OP_EXACTMB3N: case OP_EXACTN_IC: + case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N: case OP_STR_N_IC: if (! is_in_string_pool(reg, op->exact_n.s)) xfree(op->exact_n.s); break; - case OP_EXACT1: case OP_EXACT2: case OP_EXACT3: case OP_EXACT4: - case OP_EXACT5: case OP_EXACTMB2N1: case OP_EXACTMB2N2: - case OP_EXACTMB2N3: case OP_EXACT1_IC: + case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4: + case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2: + case OP_STR_MB2N3: case OP_STR_1_IC: break; case OP_CCLASS_NOT: case OP_CCLASS: @@ -298,17 +298,17 @@ ops_calc_size_of_string_pool(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: total += op->exact_len_n.len * op->exact_len_n.n; break; - case OP_EXACTN: - case OP_EXACTN_IC: + case OP_STR_N: + case OP_STR_N_IC: total += op->exact_n.n; break; - case OP_EXACTMB2N: + case OP_STR_MB2N: total += op->exact_n.n * 2; break; - case OP_EXACTMB3N: + case OP_STR_MB3N: total += op->exact_n.n * 3; break; @@ -349,15 +349,15 @@ ops_make_string_pool(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: len = op->exact_len_n.len * op->exact_len_n.n; xmemcpy(curr, 
op->exact_len_n.s, len); xfree(op->exact_len_n.s); op->exact_len_n.s = curr; curr += len; break; - case OP_EXACTN: - case OP_EXACTN_IC: + case OP_STR_N: + case OP_STR_N_IC: len = op->exact_n.n; copy: xmemcpy(curr, op->exact_n.s, len); @@ -365,11 +365,11 @@ ops_make_string_pool(regex_t* reg) op->exact_n.s = curr; curr += len; break; - case OP_EXACTMB2N: + case OP_STR_MB2N: len = op->exact_n.n * 2; goto copy; break; - case OP_EXACTMB3N: + case OP_STR_MB3N: len = op->exact_n.n * 3; goto copy; break; @@ -427,7 +427,7 @@ onig_positive_int_multiply(int x, int y) static void -swap_node(Node* a, Node* b) +node_swap(Node* a, Node* b) { Node c; @@ -452,6 +452,81 @@ swap_node(Node* a, Node* b) } } +static int +node_list_len(Node* list) +{ + int len; + + len = 1; + while (IS_NOT_NULL(NODE_CDR(list))) { + list = NODE_CDR(list); + len++; + } + + return len; +} + +static Node* +node_list_add(Node* list, Node* x) +{ + Node *n; + + n = onig_node_new_list(x, NULL); + if (IS_NULL(n)) return NULL_NODE; + + if (IS_NOT_NULL(list)) { + while (IS_NOT_NULL(NODE_CDR(list))) + list = NODE_CDR(list); + + NODE_CDR(list) = n; + } + + return n; +} + +static int +node_str_node_cat(Node* node, Node* add) +{ + int r; + + if (STR_(node)->flag != STR_(add)->flag) + return ONIGERR_TYPE_BUG; + + r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end); + if (r != 0) return r; + + if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) + STR_(node)->case_min_len += STR_(add)->case_min_len; + + return 0; +} + +static int +node_str_cat_case_fold(Node* node, const UChar* s, const UChar* end, int case_min_len) +{ + int r; + + if (! 
NODE_STRING_IS_CASE_FOLD_MATCH(node)) + return ONIGERR_TYPE_BUG; + + r = onig_node_str_cat(node, s, end); + if (r != 0) return r; + + STR_(node)->case_min_len += case_min_len; + return 0; +} + +static void +node_conv_to_str_node(Node* node, int flag) +{ + NODE_SET_TYPE(node, NODE_STRING); + STR_(node)->flag = flag; + STR_(node)->s = STR_(node)->buf; + STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; +} + static OnigLen distance_add(OnigLen d1, OnigLen d2) { @@ -549,52 +624,45 @@ static int compile_length_tree(Node* node, regex_t* reg); static int compile_tree(Node* node, regex_t* reg, ScanEnv* env); -#define IS_NEED_STR_LEN_OP_EXACT(op) \ - ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\ - (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC) +#define IS_NEED_STR_LEN_OP(op) \ + ((op) == OP_STR_N || (op) == OP_STR_MB2N ||\ + (op) == OP_STR_MB3N || (op) == OP_STR_MBN || (op) == OP_STR_N_IC) static int -select_str_opcode(int mb_len, int str_len, int ignore_case) +select_str_opcode(int mb_len, int str_len) { int op; - if (ignore_case) { + switch (mb_len) { + case 1: switch (str_len) { - case 1: op = OP_EXACT1_IC; break; - default: op = OP_EXACTN_IC; break; + case 1: op = OP_STR_1; break; + case 2: op = OP_STR_2; break; + case 3: op = OP_STR_3; break; + case 4: op = OP_STR_4; break; + case 5: op = OP_STR_5; break; + default: op = OP_STR_N; break; } - } - else { - switch (mb_len) { - case 1: - switch (str_len) { - case 1: op = OP_EXACT1; break; - case 2: op = OP_EXACT2; break; - case 3: op = OP_EXACT3; break; - case 4: op = OP_EXACT4; break; - case 5: op = OP_EXACT5; break; - default: op = OP_EXACTN; break; - } - break; + break; - case 2: - switch (str_len) { - case 1: op = OP_EXACTMB2N1; break; - case 2: op = OP_EXACTMB2N2; break; - case 3: op = OP_EXACTMB2N3; break; - default: op = OP_EXACTMB2N; break; - } - break; + case 2: + switch (str_len) { + case 1: op = OP_STR_MB2N1; break; + case 2: op = OP_STR_MB2N2; 
break; + case 3: op = OP_STR_MB2N3; break; + default: op = OP_STR_MB2N; break; + } + break; - case 3: - op = OP_EXACTMB3N; - break; + case 3: + op = OP_STR_MB3N; + break; - default: - op = OP_EXACTMBN; - break; - } + default: + op = OP_STR_MBN; + break; } + return op; } @@ -621,31 +689,43 @@ is_strict_real_node(Node* node) } static int -compile_tree_empty_check(Node* node, regex_t* reg, int emptiness, ScanEnv* env) +compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) { int r; - int saved_num_null_check = reg->num_null_check; + int saved_num_empty_check; + int emptiness; + Node* body; + + body = NODE_BODY((Node* )qn); + emptiness = qn->emptiness; + saved_num_empty_check = reg->num_empty_check; if (emptiness != BODY_IS_NOT_EMPTY) { r = add_op(reg, OP_EMPTY_CHECK_START); if (r != 0) return r; - COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */ - reg->num_null_check++; + COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */ + reg->num_empty_check++; } - r = compile_tree(node, reg, env); + r = compile_tree(body, reg, env); if (r != 0) return r; if (emptiness != BODY_IS_NOT_EMPTY) { if (emptiness == BODY_IS_EMPTY_POSSIBILITY) r = add_op(reg, OP_EMPTY_CHECK_END); - else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) - r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) { + if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0) + r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); + else + r = add_op(reg, OP_EMPTY_CHECK_END); + } +#ifdef USE_CALL else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); +#endif if (r != 0) return r; - COP(reg)->empty_check_end.mem = saved_num_null_check; /* NULL CHECK ID */ + COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */ } return r; } @@ -682,14 +762,13 @@ compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env) static int add_compile_string_length(UChar* s 
ARG_UNUSED, int mb_len, int str_len, - regex_t* reg ARG_UNUSED, int ignore_case) + regex_t* reg ARG_UNUSED) { return 1; } static int -add_compile_string(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) +add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg) { int op; int r; @@ -697,14 +776,14 @@ add_compile_string(UChar* s, int mb_len, int str_len, UChar* p; UChar* end; - op = select_str_opcode(mb_len, str_len, ignore_case); + op = select_str_opcode(mb_len, str_len); r = add_op(reg, op); if (r != 0) return r; byte_len = mb_len * str_len; end = s + byte_len; - if (op == OP_EXACTMBN) { + if (op == OP_STR_MBN) { p = onigenc_strdup(reg->enc, s, end); CHECK_NULL_RETURN_MEMERR(p); @@ -712,11 +791,11 @@ add_compile_string(UChar* s, int mb_len, int str_len, COP(reg)->exact_len_n.n = str_len; COP(reg)->exact_len_n.s = p; } - else if (IS_NEED_STR_LEN_OP_EXACT(op)) { + else if (IS_NEED_STR_LEN_OP(op)) { p = onigenc_strdup(reg->enc, s, end); CHECK_NULL_RETURN_MEMERR(p); - if (op == OP_EXACTN_IC) + if (op == OP_STR_N_IC) COP(reg)->exact_n.n = byte_len; else COP(reg)->exact_n.n = str_len; @@ -724,8 +803,8 @@ add_compile_string(UChar* s, int mb_len, int str_len, COP(reg)->exact_n.s = p; } else { + xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s)); xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len); - COP(reg)->exact.s[byte_len] = '\0'; } return 0; @@ -734,7 +813,7 @@ add_compile_string(UChar* s, int mb_len, int str_len, static int compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig; + int rlen, r, len, prev_len, slen; UChar *p, *prev; StrNode* sn; OnigEncoding enc = reg->enc; @@ -743,7 +822,7 @@ compile_length_string_node(Node* node, regex_t* reg) if (sn->end <= sn->s) return 0; - ambig = NODE_STRING_IS_AMBIG(node); + if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) return 1; p = prev = sn->s; prev_len = enclen(enc, p); @@ -757,7 +836,7 @@ compile_length_string_node(Node* node, regex_t* reg) slen++; } 
else { - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, slen, reg); rlen += r; prev = p; slen = 1; @@ -766,25 +845,59 @@ compile_length_string_node(Node* node, regex_t* reg) p += len; } - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, slen, reg); rlen += r; return rlen; } static int -compile_length_string_raw_node(StrNode* sn, regex_t* reg) +compile_length_string_crude_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s), - reg, 0); + reg); +} + +static int +compile_ambig_string_node(Node* node, regex_t* reg) +{ + int r; + int len; + int byte_len; + UChar* p; + StrNode* sn; + OnigEncoding enc = reg->enc; + + sn = STR_(node); + len = enclen(enc, sn->s); + byte_len = (int )(sn->end - sn->s); + if (len == byte_len) { + r = add_op(reg, OP_STR_1_IC); + if (r != 0) return r; + + xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s)); + xmemcpy(COP(reg)->exact.s, sn->s, (size_t )byte_len); + } + else { + r = add_op(reg, OP_STR_N_IC); + if (r != 0) return r; + + p = onigenc_strdup(enc, sn->s, sn->end); + CHECK_NULL_RETURN_MEMERR(p); + + COP(reg)->exact_n.s = p; + COP(reg)->exact_n.n = byte_len; + } + + return 0; } static int compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig; + int r, len, prev_len, slen; UChar *p, *prev, *end; StrNode* sn; OnigEncoding enc = reg->enc; @@ -794,7 +907,9 @@ compile_string_node(Node* node, regex_t* reg) return 0; end = sn->end; - ambig = NODE_STRING_IS_AMBIG(node); + if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) { + return compile_ambig_string_node(node, reg); + } p = prev = sn->s; prev_len = enclen(enc, p); @@ -807,7 +922,7 @@ compile_string_node(Node* node, regex_t* reg) slen++; } else { - r = add_compile_string(prev, prev_len, slen, reg, ambig); + r = add_compile_string(prev, prev_len, 
slen, reg); if (r != 0) return r; prev = p; @@ -818,16 +933,16 @@ compile_string_node(Node* node, regex_t* reg) p += len; } - return add_compile_string(prev, prev_len, slen, reg, ambig); + return add_compile_string(prev, prev_len, slen, reg); } static int -compile_string_raw_node(StrNode* sn, regex_t* reg) +compile_string_crude_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; - return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg, 0); + return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg); } static void* @@ -891,15 +1006,27 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) return 0; } +static void +set_addr_in_repeat_range(regex_t* reg) +{ + int i; + + for (i = 0; i < reg->num_repeat; i++) { + RepeatRange* p = reg->repeat_range + i; + int offset = p->u.offset; + p->u.pcode = reg->ops + offset; + } +} + static int -entry_repeat_range(regex_t* reg, int id, int lower, int upper) +entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index) { #define REPEAT_RANGE_ALLOC 4 - OnigRepeatRange* p; + RepeatRange* p; if (reg->repeat_range_alloc == 0) { - p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC); + p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = REPEAT_RANGE_ALLOC; @@ -907,7 +1034,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) else if (reg->repeat_range_alloc <= id) { int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; - p = (OnigRepeatRange* )xrealloc(reg->repeat_range, sizeof(OnigRepeatRange) * n); + p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = n; @@ -916,8 +1043,9 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) p = reg->repeat_range; } - p[id].lower = lower; - p[id].upper = (IS_INFINITE_REPEAT(upper) ? 
0x7fffffff : upper); + p[id].lower = lower; + p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper); + p[id].u.offset = ops_index; return 0; } @@ -932,24 +1060,16 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, if (r != 0) return r; COP(reg)->repeat.id = num_repeat; - COP(reg)->repeat.addr = SIZE_INC_OP + target_len + SIZE_OP_REPEAT_INC; + COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC; - r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper); + r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper, + COP_CURR_OFFSET(reg) + OPSIZE_REPEAT); if (r != 0) return r; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - if ( -#ifdef USE_CALL - NODE_IS_IN_MULTI_ENTRY(qn) || -#endif - NODE_IS_IN_REAL_REPEAT(qn)) { - r = add_op(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); - } - else { - r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); - } + r = add_op(reg, qn->greedy ? 
OP_REPEAT_INC : OP_REPEAT_INC_NG); if (r != 0) return r; COP(reg)->repeat_inc.id = num_repeat; @@ -985,21 +1105,21 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) if (qn->lower <= 1 || int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) { if (IS_NOT_NULL(qn->next_head_exact)) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + return OPSIZE_ANYCHAR_STAR + tlen * qn->lower; } } mod_tlen = tlen; if (emptiness != BODY_IS_NOT_EMPTY) - mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END; + mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { - len = SIZE_OP_JUMP; + len = OPSIZE_JUMP; } else { len = tlen * qn->lower; @@ -1008,36 +1128,36 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) - len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH_OR_JUMP_EXACT1 + mod_tlen + OPSIZE_JUMP; else #endif if (IS_NOT_NULL(qn->next_head_exact)) - len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH_IF_PEEK_NEXT + mod_tlen + OPSIZE_JUMP; else - len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH + mod_tlen + OPSIZE_JUMP; } else - len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH; + len += OPSIZE_JUMP + mod_tlen + OPSIZE_PUSH; } else if (qn->upper == 0) { - if (qn->is_refered != 0) { /* /(?<n>..){0}/ */ - len = SIZE_OP_JUMP + tlen; + if (qn->include_referred != 0) { /* /(?<n>..){0}/ */ + len = OPSIZE_JUMP + tlen; } else len = 0; } else if (!infinite && qn->greedy && (qn->upper == 1 || - int_multiply_cmp(tlen + SIZE_OP_PUSH, qn->upper, + int_multiply_cmp(tlen + OPSIZE_PUSH, 
qn->upper, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { len = tlen * qn->lower; - len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); + len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower); } else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ - len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen; + len = OPSIZE_PUSH + OPSIZE_JUMP + tlen; } else { - len = SIZE_OP_REPEAT_INC + mod_tlen + SIZE_OP_REPEAT; + len = OPSIZE_REPEAT_INC + mod_tlen + OPSIZE_REPEAT; } return len; @@ -1078,7 +1198,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) mod_tlen = tlen; if (emptiness != BODY_IS_NOT_EMPTY) - mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END; + mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || @@ -1091,16 +1211,16 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) - COP(reg)->jump.addr = SIZE_OP_PUSH_OR_JUMP_EXACT1 + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH_OR_JUMP_EXACT1 + SIZE_INC; else #endif if (IS_NOT_NULL(qn->next_head_exact)) - COP(reg)->jump.addr = SIZE_OP_PUSH_IF_PEEK_NEXT + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH_IF_PEEK_NEXT + SIZE_INC; else - COP(reg)->jump.addr = SIZE_OP_PUSH + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH + SIZE_INC; } else { - COP(reg)->jump.addr = SIZE_OP_JUMP + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_JUMP + SIZE_INC; } } else { @@ -1113,36 +1233,36 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (IS_NOT_NULL(qn->head_exact)) { r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1); if (r != 0) return r; - COP(reg)->push_or_jump_exact1.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); + r = 
compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1); + addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1); } else #endif if (IS_NOT_NULL(qn->next_head_exact)) { r = add_op(reg, OP_PUSH_IF_PEEK_NEXT); if (r != 0) return r; - COP(reg)->push_if_peek_next.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH_IF_PEEK_NEXT); + addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT); } else { r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH); + addr = -(mod_tlen + (int )OPSIZE_PUSH); } r = add_op(reg, OP_JUMP); @@ -1152,9 +1272,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) else { r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = mod_tlen + SIZE_INC_OP; + COP(reg)->jump.addr = mod_tlen + SIZE_INC; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; r = add_op(reg, OP_PUSH); @@ -1163,10 +1283,10 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } } else if (qn->upper == 0) { - if (qn->is_refered != 0) { /* /(?<n>..){0}/ */ + if (qn->include_referred != 0) { /* /(?<n>..){0}/ */ r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = tlen + SIZE_INC_OP; + COP(reg)->jump.addr = tlen + SIZE_INC; 
r = compile_tree(NODE_QUANT_BODY(qn), reg, env); } @@ -1177,7 +1297,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } else if (! infinite && qn->greedy && (qn->upper == 1 || - int_multiply_cmp(tlen + SIZE_OP_PUSH, qn->upper, + int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { int n = qn->upper - qn->lower; @@ -1185,7 +1305,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; for (i = 0; i < n; i++) { - int v = onig_positive_int_multiply(n - i, tlen + SIZE_OP_PUSH); + int v = onig_positive_int_multiply(n - i, tlen + OPSIZE_PUSH); if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; r = add_op(reg, OP_PUSH); @@ -1199,11 +1319,11 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP; r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = tlen + SIZE_INC_OP; + COP(reg)->jump.addr = tlen + SIZE_INC; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); } @@ -1260,35 +1380,35 @@ compile_length_bag_node(BagNode* node, regex_t* reg) #ifdef USE_CALL if (node->m.regnum == 0 && NODE_IS_CALLED(node)) { - len = tlen + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; + len = tlen + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN; return len; } if (NODE_IS_CALLED(node)) { - len = SIZE_OP_MEMORY_START_PUSH + tlen - + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + len = OPSIZE_MEM_START_PUSH + tlen + + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN; + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH); else len += (NODE_IS_RECURSION(node) - ? 
SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END); } else if (NODE_IS_RECURSION(node)) { - len = SIZE_OP_MEMORY_START_PUSH; - len += tlen + (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC); + len = OPSIZE_MEM_START_PUSH; + len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum) + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_REC); } else #endif { - if (MEM_STATUS_AT0(reg->bt_mem_start, node->m.regnum)) - len = SIZE_OP_MEMORY_START_PUSH; + if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum)) + len = OPSIZE_MEM_START_PUSH; else - len = SIZE_OP_MEMORY_START; + len = OPSIZE_MEM_START; - len += tlen + (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum) - ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END); + len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum) + ? OPSIZE_MEM_END_PUSH : OPSIZE_MEM_END); } break; @@ -1303,10 +1423,10 @@ compile_length_bag_node(BagNode* node, regex_t* reg) v = onig_positive_int_multiply(qn->lower, tlen); if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; - len = v + SIZE_OP_PUSH + tlen + SIZE_OP_POP_OUT + SIZE_OP_JUMP; + len = v + OPSIZE_PUSH + tlen + OPSIZE_POP_OUT + OPSIZE_JUMP; } else { - len = SIZE_OP_ATOMIC_START + tlen + SIZE_OP_ATOMIC_END; + len = OPSIZE_ATOMIC_START + tlen + OPSIZE_ATOMIC_END; } break; @@ -1318,8 +1438,8 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len = compile_length_tree(cond, reg); if (len < 0) return len; - len += SIZE_OP_PUSH; - len += SIZE_OP_ATOMIC_START + SIZE_OP_ATOMIC_END; + len += OPSIZE_PUSH; + len += OPSIZE_ATOMIC_START + OPSIZE_ATOMIC_END; if (IS_NOT_NULL(Then)) { tlen = compile_length_tree(Then, reg); @@ -1327,7 +1447,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len += tlen; } - len += SIZE_OP_JUMP + SIZE_OP_ATOMIC_END; + len += OPSIZE_JUMP + OPSIZE_ATOMIC_END; if (IS_NOT_NULL(Else)) { tlen = compile_length_tree(Else, reg); @@ -1352,24 +1472,25 @@ static int 
compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) { int r; - int len; #ifdef USE_CALL if (NODE_IS_CALLED(node)) { + int len; + r = add_op(reg, OP_CALL); if (r != 0) return r; - node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + SIZE_OP_JUMP; + node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP; NODE_STATUS_ADD(node, ADDR_FIXED); COP(reg)->call.addr = (int )node->m.called_addr; if (node->m.regnum == 0) { len = compile_length_tree(NODE_BAG_BODY(node), reg); - len += SIZE_OP_RETURN; + len += OPSIZE_RETURN; r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = len + SIZE_INC_OP; + COP(reg)->jump.addr = len + SIZE_INC; r = compile_tree(NODE_BAG_BODY(node), reg, env); if (r != 0) return r; @@ -1379,25 +1500,24 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) } else { len = compile_length_tree(NODE_BAG_BODY(node), reg); - len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN); - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN); + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH); else - len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + len += (NODE_IS_RECURSION(node) ? 
OPSIZE_MEM_END_REC : OPSIZE_MEM_END); r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = len + SIZE_INC_OP; + COP(reg)->jump.addr = len + SIZE_INC; } } #endif - if (MEM_STATUS_AT0(reg->bt_mem_start, node->m.regnum)) - r = add_op(reg, OP_MEMORY_START_PUSH); + if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum)) + r = add_op(reg, OP_MEM_START_PUSH); else - r = add_op(reg, OP_MEMORY_START); + r = add_op(reg, OP_MEM_START); if (r != 0) return r; COP(reg)->memory_start.num = node->m.regnum; @@ -1405,11 +1525,11 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) if (r != 0) return r; #ifdef USE_CALL - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) r = add_op(reg, (NODE_IS_RECURSION(node) - ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH)); + ? OP_MEM_END_PUSH_REC : OP_MEM_END_PUSH)); else - r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEMORY_END_REC : OP_MEMORY_END)); + r = add_op(reg, (NODE_IS_RECURSION(node) ? 
OP_MEM_END_REC : OP_MEM_END)); if (r != 0) return r; COP(reg)->memory_end.num = node->m.regnum; @@ -1418,10 +1538,10 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_RETURN); } #else - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) - r = add_op(reg, OP_MEMORY_END_PUSH); + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) + r = add_op(reg, OP_MEM_END_PUSH); else - r = add_op(reg, OP_MEMORY_END); + r = add_op(reg, OP_MEM_END); if (r != 0) return r; COP(reg)->memory_end.num = node->m.regnum; #endif @@ -1454,7 +1574,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + len + SIZE_OP_POP_OUT + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP_OUT + OPSIZE_JUMP; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); if (r != 0) return r; @@ -1463,7 +1583,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP_OUT); + COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP_OUT); } else { r = add_op(reg, OP_ATOMIC_START); @@ -1493,11 +1613,11 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) else then_len = 0; - jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END + SIZE_OP_JUMP; + jump_len = cond_len + then_len + OPSIZE_ATOMIC_END + OPSIZE_JUMP; r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + jump_len; + COP(reg)->push.addr = SIZE_INC + jump_len; r = compile_tree(cond, reg, env); if (r != 0) return r; @@ -1518,7 +1638,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = SIZE_OP_ATOMIC_END + else_len + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_ATOMIC_END + else_len + SIZE_INC; r = add_op(reg, OP_ATOMIC_END); if (r != 0) return r; @@ -1546,16 
+1666,16 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) switch (node->type) { case ANCR_PREC_READ: - len = SIZE_OP_PREC_READ_START + tlen + SIZE_OP_PREC_READ_END; + len = OPSIZE_PREC_READ_START + tlen + OPSIZE_PREC_READ_END; break; case ANCR_PREC_READ_NOT: - len = SIZE_OP_PREC_READ_NOT_START + tlen + SIZE_OP_PREC_READ_NOT_END; + len = OPSIZE_PREC_READ_NOT_START + tlen + OPSIZE_PREC_READ_NOT_END; break; case ANCR_LOOK_BEHIND: - len = SIZE_OP_LOOK_BEHIND + tlen; + len = OPSIZE_LOOK_BEHIND + tlen; break; case ANCR_LOOK_BEHIND_NOT: - len = SIZE_OP_LOOK_BEHIND_NOT_START + tlen + SIZE_OP_LOOK_BEHIND_NOT_END; + len = OPSIZE_LOOK_BEHIND_NOT_START + tlen + OPSIZE_LOOK_BEHIND_NOT_END; break; case ANCR_WORD_BOUNDARY: @@ -1564,7 +1684,7 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) case ANCR_WORD_BEGIN: case ANCR_WORD_END: #endif - len = SIZE_OP_WORD_BOUNDARY; + len = OPSIZE_WORD_BOUNDARY; break; case ANCR_TEXT_SEGMENT_BOUNDARY: @@ -1648,7 +1768,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_PREC_READ_NOT_START); if (r != 0) return r; - COP(reg)->prec_read_not_start.addr = SIZE_INC_OP + len + SIZE_OP_PREC_READ_NOT_END; + COP(reg)->prec_read_not_start.addr = SIZE_INC + len + OPSIZE_PREC_READ_NOT_END; r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); if (r != 0) return r; r = add_op(reg, OP_PREC_READ_NOT_END); @@ -1678,7 +1798,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); r = add_op(reg, OP_LOOK_BEHIND_NOT_START); if (r != 0) return r; - COP(reg)->look_behind_not_start.addr = SIZE_INC_OP + len + SIZE_OP_LOOK_BEHIND_NOT_END; + COP(reg)->look_behind_not_start.addr = SIZE_INC + len + OPSIZE_LOOK_BEHIND_NOT_END; if (node->char_len < 0) { r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n); @@ -1764,25 +1884,25 @@ compile_length_gimmick_node(GimmickNode* node, regex_t* reg) switch (node->type) { case GIMMICK_FAIL: - len = 
SIZE_OP_FAIL; + len = OPSIZE_FAIL; break; case GIMMICK_SAVE: - len = SIZE_OP_PUSH_SAVE_VAL; + len = OPSIZE_PUSH_SAVE_VAL; break; case GIMMICK_UPDATE_VAR: - len = SIZE_OP_UPDATE_VAR; + len = OPSIZE_UPDATE_VAR; break; #ifdef USE_CALLOUT case GIMMICK_CALLOUT: switch (node->detail_type) { case ONIG_CALLOUT_OF_CONTENTS: - len = SIZE_OP_CALLOUT_CONTENTS; + len = OPSIZE_CALLOUT_CONTENTS; break; case ONIG_CALLOUT_OF_NAME: - len = SIZE_OP_CALLOUT_NAME; + len = OPSIZE_CALLOUT_NAME; break; default: @@ -1821,13 +1941,13 @@ compile_length_tree(Node* node, regex_t* reg) r += compile_length_tree(NODE_CAR(node), reg); n++; } while (IS_NOT_NULL(node = NODE_CDR(node))); - r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1); + r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1); } break; case NODE_STRING: - if (NODE_STRING_IS_RAW(node)) - r = compile_length_string_raw_node(STR_(node), reg); + if (NODE_STRING_IS_CRUDE(node)) + r = compile_length_string_crude_node(STR_(node), reg); else r = compile_length_string_node(node, reg); break; @@ -1841,12 +1961,12 @@ compile_length_tree(Node* node, regex_t* reg) break; case NODE_BACKREF: - r = SIZE_OP_BACKREF; + r = OPSIZE_BACKREF; break; #ifdef USE_CALL case NODE_CALL: - r = SIZE_OP_CALL; + r = OPSIZE_CALL; break; #endif @@ -1893,7 +2013,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) do { len += compile_length_tree(NODE_CAR(x), reg); if (IS_NOT_NULL(NODE_CDR(x))) { - len += SIZE_OP_PUSH + SIZE_OP_JUMP; + len += OPSIZE_PUSH + OPSIZE_JUMP; } } while (IS_NOT_NULL(x = NODE_CDR(x))); pos = COP_CURR_OFFSET(reg) + 1 + len; /* goal position */ @@ -1904,7 +2024,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) enum OpCode push = NODE_IS_SUPER(node) ? 
OP_PUSH_SUPER : OP_PUSH; r = add_op(reg, push); if (r != 0) break; - COP(reg)->push.addr = SIZE_INC_OP + len + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + len + OPSIZE_JUMP; } r = compile_tree(NODE_CAR(node), reg, env); if (r != 0) break; @@ -1919,8 +2039,8 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) break; case NODE_STRING: - if (NODE_STRING_IS_RAW(node)) - r = compile_string_raw_node(STR_(node), reg); + if (NODE_STRING_IS_CRUDE(node)) + r = compile_string_crude_node(STR_(node), reg); else r = compile_string_node(node, reg); break; @@ -2090,8 +2210,9 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) Node** ptarget = &(NODE_BODY(node)); Node* old = *ptarget; r = noname_disable_map(ptarget, map, counter); + if (r != 0) return r; if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) { - onig_reduce_nested_quantifier(node, *ptarget); + r = onig_reduce_nested_quantifier(node); } } break; @@ -2303,11 +2424,11 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) } } - loc = env->capture_history; - MEM_STATUS_CLEAR(env->capture_history); + loc = env->cap_history; + MEM_STATUS_CLEAR(env->cap_history); for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { if (MEM_STATUS_AT(loc, i)) { - MEM_STATUS_ON_SIMPLE(env->capture_history, map[i].new_val); + MEM_STATUS_ON_SIMPLE(env->cap_history, map[i].new_val); } } @@ -2683,7 +2804,7 @@ is_exclusive(Node* x, Node* y, regex_t* reg) len = NODE_STRING_LEN(x); if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y); - if (NODE_STRING_IS_AMBIG(x) || NODE_STRING_IS_AMBIG(y)) { + if (NODE_STRING_IS_CASE_FOLD_MATCH(x) || NODE_STRING_IS_CASE_FOLD_MATCH(y)) { /* tiny version */ return 0; } @@ -2743,7 +2864,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) break; if (exact == 0 || - ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_RAW(node)) { + ! 
IS_IGNORECASE(reg->options) || NODE_STRING_IS_CRUDE(node)) { n = node; } } @@ -2871,9 +2992,9 @@ tree_min_len(Node* node, ScanEnv* env) if (NODE_IS_RECURSION(node)) break; backs = BACKREFS_P(br); - len = tree_min_len(mem_env[backs[0]].node, env); + len = tree_min_len(mem_env[backs[0]].mem_node, env); for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].node, env); + tmin = tree_min_len(mem_env[backs[i]].mem_node, env); if (len > tmin) len = tmin; } } @@ -3042,7 +3163,7 @@ tree_max_len(Node* node, ScanEnv* env) } backs = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { - tmax = tree_max_len(mem_env[backs[i]].node, env); + tmax = tree_max_len(mem_env[backs[i]].mem_node, env); if (len < tmax) len = tmax; } } @@ -3179,7 +3300,7 @@ check_backrefs(Node* node, ScanEnv* env) if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; - NODE_STATUS_ADD(mem_env[backs[i]].node, BACKREF); + NODE_STATUS_ADD(mem_env[backs[i]].mem_node, BACKREF); } r = 0; } @@ -3193,6 +3314,204 @@ check_backrefs(Node* node, ScanEnv* env) return r; } +static int +set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) +{ + int r; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + r = set_empty_repeat_node_trav(NODE_CAR(node), empty, env); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + { + AnchorNode* an = ANCHOR_(node); + + if (! 
ANCHOR_HAS_BODY(an)) { + r = 0; + break; + } + + switch (an->type) { + case ANCR_PREC_READ: + case ANCR_LOOK_BEHIND: + empty = NULL_NODE; + break; + default: + break; + } + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + } + break; + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + + if (qn->emptiness != BODY_IS_NOT_EMPTY) empty = node; + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + } + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) { + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + if (r != 0) return r; + } + { + BagNode* en = BAG_(node); + + if (en->type == BAG_MEMORY) { + if (NODE_IS_BACKREF(node)) { + if (IS_NOT_NULL(empty)) + SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty; + } + } + else if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = set_empty_repeat_node_trav(en->te.Then, empty, env); + if (r != 0) return r; + } + if (IS_NOT_NULL(en->te.Else)) { + r = set_empty_repeat_node_trav(en->te.Else, empty, env); + } + } + } + break; + + default: + r = 0; + break; + } + + return r; +} + +static int +is_ancestor_node(Node* node, Node* me) +{ + Node* parent; + + while ((parent = NODE_PARENT(me)) != NULL_NODE) { + if (parent == node) return 1; + me = parent; + } + return 0; +} + +static void +set_empty_status_check_trav(Node* node, ScanEnv* env) +{ + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + set_empty_status_check_trav(NODE_CAR(node), env); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + { + AnchorNode* an = ANCHOR_(node); + + if (! 
ANCHOR_HAS_BODY(an)) break; + set_empty_status_check_trav(NODE_BODY(node), env); + } + break; + + case NODE_QUANT: + set_empty_status_check_trav(NODE_BODY(node), env); + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) + set_empty_status_check_trav(NODE_BODY(node), env); + { + BagNode* en = BAG_(node); + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + set_empty_status_check_trav(en->te.Then, env); + } + if (IS_NOT_NULL(en->te.Else)) { + set_empty_status_check_trav(en->te.Else, env); + } + } + } + break; + + case NODE_BACKREF: + { + int i; + int* backs; + MemEnv* mem_env = SCANENV_MEMENV(env); + BackRefNode* br = BACKREF_(node); + backs = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + Node* ernode = mem_env[backs[i]].empty_repeat_node; + if (IS_NOT_NULL(ernode)) { + if (! is_ancestor_node(ernode, node)) { + MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]); + NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK); + NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK); + } + } + } + } + break; + + default: + break; + } +} + +static void +set_parent_node_trav(Node* node, Node* parent) +{ + NODE_PARENT(node) = parent; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + set_parent_node_trav(NODE_CAR(node), node); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + if (! 
ANCHOR_HAS_BODY(ANCHOR_(node))) break; + set_parent_node_trav(NODE_BODY(node), node); + break; + + case NODE_QUANT: + set_parent_node_trav(NODE_BODY(node), node); + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) + set_parent_node_trav(NODE_BODY(node), node); + { + BagNode* en = BAG_(node); + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) + set_parent_node_trav(en->te.Then, node); + if (IS_NOT_NULL(en->te.Else)) { + set_parent_node_trav(en->te.Else, node); + } + } + } + break; + + default: + break; + } +} + #ifdef USE_CALL @@ -3298,6 +3617,9 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) if ((eret & RECURSION_MUST) == 0) r &= ~RECURSION_MUST; } + else { + r &= ~RECURSION_MUST; + } } else { r = infinite_recursive_call_check(NODE_BODY(node), env, head); @@ -3472,7 +3794,7 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) r = recursive_call_check_trav(NODE_BODY(node), env, state); if (QUANT_(node)->upper == 0) { if (r == FOUND_CALLED_NODE) - QUANT_(node)->is_refered = 1; + QUANT_(node)->include_referred = 1; } break; @@ -3495,8 +3817,10 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) if (! 
NODE_IS_RECURSION(node)) { NODE_STATUS_ADD(node, MARK1); r = recursive_call_check(NODE_BODY(node)); - if (r != 0) + if (r != 0) { NODE_STATUS_ADD(node, RECURSION); + MEM_STATUS_ON(env->backtrack_mem, en->m.regnum); + } NODE_STATUS_REMOVE(node, MARK1); } @@ -3537,6 +3861,96 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) #endif +static void +remove_from_list(Node* prev, Node* a) +{ + if (NODE_CDR(prev) != a) return ; + + NODE_CDR(prev) = NODE_CDR(a); + NODE_CDR(a) = NULL_NODE; +} + +static int +reduce_string_list(Node* node) +{ + int r = 0; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + { + Node* prev; + Node* curr; + Node* prev_node; + Node* next_node; + + prev = NULL_NODE; + do { + next_node = NODE_CDR(node); + curr = NODE_CAR(node); + if (NODE_TYPE(curr) == NODE_STRING) { + if (IS_NULL(prev) || STR_(curr)->flag != STR_(prev)->flag) { + prev = curr; + prev_node = node; + } + else { + r = node_str_node_cat(prev, curr); + if (r != 0) return r; + remove_from_list(prev_node, node); + onig_node_free(node); + } + } + else { + prev = NULL_NODE; + prev_node = node; + } + + node = next_node; + } while (r == 0 && IS_NOT_NULL(node)); + } + break; + + case NODE_ALT: + do { + r = reduce_string_list(NODE_CAR(node)); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + if (IS_NULL(NODE_BODY(node))) + break; + /* fall */ + case NODE_QUANT: + r = reduce_string_list(NODE_BODY(node)); + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + r = reduce_string_list(NODE_BODY(node)); + if (r != 0) return r; + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = reduce_string_list(en->te.Then); + if (r != 0) return r; + } + if (IS_NOT_NULL(en->te.Else)) { + r = reduce_string_list(en->te.Else); + if (r != 0) return r; + } + } + } + break; + + default: + break; + } + + return r; +} + + #define IN_ALT (1<<0) #define IN_NOT (1<<1) #define IN_REAL_REPEAT (1<<2) @@ -3559,7 +3973,7 @@ 
divide_look_behind_alternatives(Node* node) head = NODE_ANCHOR_BODY(an); np = NODE_CAR(head); - swap_node(node, head); + node_swap(node, head); NODE_CAR(node) = head; NODE_BODY(head) = np; @@ -3581,7 +3995,7 @@ divide_look_behind_alternatives(Node* node) } static int -setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) +tune_look_behind(Node* node, regex_t* reg, ScanEnv* env) { int r, len; AnchorNode* an = ANCHOR_(node); @@ -3602,7 +4016,7 @@ setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) } static int -next_setup(Node* node, Node* next_node, regex_t* reg) +tune_next(Node* node, Node* next_node, regex_t* reg) { NodeType type; @@ -3629,7 +4043,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK); CHECK_NULL_RETURN_MEMERR(en); NODE_STATUS_ADD(en, STRICT_REAL_REPEAT); - swap_node(node, en); + node_swap(node, en); NODE_BODY(node) = en; } } @@ -3649,23 +4063,57 @@ next_setup(Node* node, Node* next_node, regex_t* reg) static int -update_string_node_case_fold(regex_t* reg, Node *node) +is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[]) { - UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + int i; + + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->code_len != 1) return 0; + } + + return 1; +} + +static int +get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], int* rmin, int* rmax) +{ + int i, len, minlen, maxlen; + + minlen = INT_MAX; + maxlen = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + + len = item->byte_len; + if (len < minlen) minlen = len; + if (len > maxlen) maxlen = len; + } + + *rmin = minlen; + *rmax = maxlen; + return 0; +} + +static int +conv_string_case_fold(OnigEncoding enc, OnigCaseFoldType case_fold_flag, + UChar* s, UChar* end, UChar** rs, UChar** rend, int* rcase_min_len) +{ + UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar *sbuf, *ebuf, *sp; - int r, i, len, sbuf_size; - StrNode* sn = 
STR_(node); + int i, n, len, sbuf_size; - end = sn->end; - sbuf_size = (int )(end - sn->s) * 2; + *rs = NULL; + sbuf_size = (int )(end - s) * 2; sbuf = (UChar* )xmalloc(sbuf_size); CHECK_NULL_RETURN_MEMERR(sbuf); ebuf = sbuf + sbuf_size; + n = 0; sp = sbuf; - p = sn->s; + p = s; while (p < end) { - len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf); + len = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, buf); for (i = 0; i < len; i++) { if (sp >= ebuf) { sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2); @@ -3677,356 +4125,302 @@ update_string_node_case_fold(regex_t* reg, Node *node) *sp++ = buf[i]; } + n++; } - r = onig_node_str_set(node, sbuf, sp); - if (r != 0) { - xfree(sbuf); - return r; - } - - xfree(sbuf); + *rs = sbuf; + *rend = sp; + *rcase_min_len = n; return 0; } static int -expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end, regex_t* reg) +make_code_list_to_string(Node** rnode, OnigEncoding enc, + int n, OnigCodePoint codes[]) { - int r; - Node *node; + int r, i, len; + Node* node; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - node = onig_node_new_str(s, end); - if (IS_NULL(node)) return ONIGERR_MEMORY; + *rnode = NULL_NODE; + node = onig_node_new_str(NULL, NULL); + CHECK_NULL_RETURN_MEMERR(node); - r = update_string_node_case_fold(reg, node); - if (r != 0) { - onig_node_free(node); - return r; + for (i = 0; i < n; i++) { + len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf); + if (len < 0) { + r = len; + goto err; + } + + r = onig_node_str_cat(node, buf, buf + len); + if (r != 0) goto err; } - NODE_STRING_SET_AMBIG(node); - NODE_STRING_SET_DONT_GET_OPT_INFO(node); *rnode = node; return 0; + + err: + onig_node_free(node); + return r; } static int -expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], UChar *p, - int slen, UChar *end, regex_t* reg, Node **rnode) +unravel_cf_node_add(Node** rlist, Node* add) { - int r, i, j; - int len; - int varlen; - Node *anode, *var_anode, *snode, *xnode, *an; - UChar 
buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - - *rnode = var_anode = NULL_NODE; + Node *list; - varlen = 0; - for (i = 0; i < item_num; i++) { - if (items[i].byte_len != slen) { - varlen = 1; - break; - } + list = *rlist; + if (IS_NULL(list)) { + list = onig_node_new_list(add, NULL); + CHECK_NULL_RETURN_MEMERR(list); + *rlist = list; } + else { + Node* r = node_list_add(list, add); + CHECK_NULL_RETURN_MEMERR(r); + } + + return 0; +} - if (varlen != 0) { - *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(var_anode)) return ONIGERR_MEMORY; +static int +unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, + unsigned int flag, int case_min_len) +{ + int r; + Node *sn, *list; - xnode = onig_node_new_list(NULL, NULL); - if (IS_NULL(xnode)) goto mem_err; - NODE_CAR(var_anode) = xnode; + list = *rlist; + sn = *rsn; - anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(anode)) goto mem_err; - NODE_CAR(xnode) = anode; + if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) { + if (NODE_STRING_IS_CASE_FOLD_MATCH(sn)) + r = node_str_cat_case_fold(sn, s, end, case_min_len); + else + r = onig_node_str_cat(sn, s, end); } else { - *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(anode)) return ONIGERR_MEMORY; + sn = onig_node_new_str(s, end); + CHECK_NULL_RETURN_MEMERR(sn); + + STR_(sn)->flag = flag; + STR_(sn)->case_min_len = case_min_len; + r = unravel_cf_node_add(&list, sn); } - snode = onig_node_new_str(p, p + slen); - if (IS_NULL(snode)) goto mem_err; + if (r == 0) { + *rlist = list; + *rsn = sn; + } + return r; +} - NODE_CAR(anode) = snode; +static int +unravel_cf_string_fold_add(Node** rlist, Node** rsn, OnigEncoding enc, + OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) +{ + int r; + int case_min_len; + UChar *rs, *rend; - for (i = 0; i < item_num; i++) { - snode = onig_node_new_str(NULL, NULL); - if (IS_NULL(snode)) goto mem_err; + r = conv_string_case_fold(enc, case_fold_flag, s, end, + &rs, &rend, 
&case_min_len); + if (r != 0) return r; - for (j = 0; j < items[i].code_len; j++) { - len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf); - if (len < 0) { - r = len; - goto mem_err2; - } + r = unravel_cf_string_add(rlist, rsn, rs, rend, + NODE_STRING_CASE_FOLD_MATCH, case_min_len); + xfree(rs); - r = onig_node_str_cat(snode, buf, buf + len); - if (r != 0) goto mem_err2; - } + return r; +} - an = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(an)) { - goto mem_err2; - } +static int +unravel_cf_string_alt_or_cc_add(Node** rlist, int n, + OnigCaseFoldCodeItem items[], int byte_len, OnigEncoding enc, + OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) +{ + int r, i; + Node* node; - if (items[i].byte_len != slen && IS_NOT_NULL(var_anode)) { - Node *rem; - UChar *q = p + items[i].byte_len; + if (is_all_code_len_1_items(n, items)) { + OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */ - if (q < end) { - r = expand_case_fold_make_rem_string(&rem, q, end, reg); - if (r != 0) { - onig_node_free(an); - goto mem_err2; - } + codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end); + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + codes[i+1] = item->code[0]; + } + r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes); + if (r != 0) return r; + } + else { + Node *snode, *alt, *curr; - xnode = onig_node_list_add(NULL_NODE, snode); - if (IS_NULL(xnode)) { - onig_node_free(an); - onig_node_free(rem); - goto mem_err2; - } - if (IS_NULL(onig_node_list_add(xnode, rem))) { - onig_node_free(an); - onig_node_free(xnode); - onig_node_free(rem); - goto mem_err; - } + snode = onig_node_new_str(s, end); + CHECK_NULL_RETURN_MEMERR(snode); + node = curr = onig_node_new_alt(snode, NULL_NODE); + if (IS_NULL(curr)) { + onig_node_free(snode); + return ONIGERR_MEMORY; + } - NODE_CAR(an) = xnode; + r = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + r = make_code_list_to_string(&snode, enc, item->code_len, 
item->code); + if (r != 0) { + onig_node_free(node); + return r; } - else { - NODE_CAR(an) = snode; + + alt = onig_node_new_alt(snode, NULL_NODE); + if (IS_NULL(alt)) { + onig_node_free(snode); + onig_node_free(node); + return ONIGERR_MEMORY; } - NODE_CDR(var_anode) = an; - var_anode = an; - } - else { - NODE_CAR(an) = snode; - NODE_CDR(anode) = an; - anode = an; + NODE_CDR(curr) = alt; + curr = alt; } } - return varlen; - - mem_err2: - onig_node_free(snode); - - mem_err: - onig_node_free(*rnode); - - return ONIGERR_MEMORY; + r = unravel_cf_node_add(rlist, node); + if (r != 0) onig_node_free(node); + return r; } static int -is_good_case_fold_items_for_search(OnigEncoding enc, int slen, - int n, OnigCaseFoldCodeItem items[]) +unravel_cf_look_behind_add(Node** rlist, Node** rsn, + int n, OnigCaseFoldCodeItem items[], OnigEncoding enc, + UChar* s, int one_len) { - int i, len; - UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + int r, i, found; + found = 0; for (i = 0; i < n; i++) { OnigCaseFoldCodeItem* item = items + i; + if (item->byte_len == one_len) { + if (item->code_len == 1) { + found = 1; + } + } + } - if (item->code_len != 1) return 0; - if (item->byte_len != slen) return 0; - len = ONIGENC_CODE_TO_MBC(enc, item->code[0], buf); - if (len != slen) return 0; + if (found == 0) { + r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */, 0); } + else { + Node* node; + OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */ - return 1; -} + found = 0; + codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len); + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->byte_len == one_len) { + if (item->code_len == 1) { + codes[found++] = item->code[0]; + } + } + } + r = onig_new_cclass_with_code_list(&node, enc, found, codes); + if (r != 0) return r; -#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8 + r = unravel_cf_node_add(rlist, node); + if (r != 0) onig_node_free(node); + + *rsn = NULL_NODE; + } + + return r; +} 
static int -expand_case_fold_string(Node* node, regex_t* reg, int state) -{ - int r, n, len, alt_num; - int fold_len; - int prev_is_ambig, prev_is_good, is_good, is_in_look_behind; - UChar *start, *end, *p; - UChar* foldp; - Node *top_root, *root, *snode, *prev_node; +unravel_case_fold_string(Node* node, regex_t* reg, int state) +{ + int r, n, one_len, min_len, max_len, in_look_behind; + UChar *start, *end, *p, *q; + StrNode* snode; + Node *sn, *list; + OnigEncoding enc; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - StrNode* sn; - if (NODE_STRING_IS_AMBIG(node)) return 0; + if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0; - sn = STR_(node); + snode = STR_(node); - start = sn->s; - end = sn->end; + start = snode->s; + end = snode->end; if (start >= end) return 0; - is_in_look_behind = (state & IN_LOOK_BEHIND) != 0; + in_look_behind = (state & IN_LOOK_BEHIND) != 0; + enc = reg->enc; - r = 0; - top_root = root = prev_node = snode = NULL_NODE; - alt_num = 1; + list = sn = NULL_NODE; p = start; while (p < end) { - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag, - p, end, items); + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, + items); if (n < 0) { r = n; goto err; } - len = enclen(reg->enc, p); - is_good = is_good_case_fold_items_for_search(reg->enc, len, n, items); - - if (is_in_look_behind || - (IS_NOT_NULL(snode) || - (is_good - /* expand single char case: ex. 
/(?i:a)/ */ - && !(p == start && p + len >= end)))) { - if (IS_NULL(snode)) { - if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - prev_node = snode = onig_node_new_str(NULL, NULL); - if (IS_NULL(snode)) goto mem_err; - if (IS_NOT_NULL(root)) { - if (IS_NULL(onig_node_list_add(root, snode))) { - onig_node_free(snode); - goto mem_err; - } - } - - prev_is_ambig = -1; /* -1: new */ - prev_is_good = 0; /* escape compiler warning */ - } - else { - prev_is_ambig = NODE_STRING_IS_AMBIG(snode); - prev_is_good = NODE_STRING_IS_GOOD_AMBIG(snode); - } - - if (n != 0) { - foldp = p; - fold_len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, - &foldp, end, buf); - foldp = buf; - } - else { - foldp = p; fold_len = len; - } - - if ((prev_is_ambig == 0 && n != 0) || - (prev_is_ambig > 0 && (n == 0 || prev_is_good != is_good))) { - if (IS_NULL(root) /* && IS_NOT_NULL(prev_node) */) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - prev_node = snode = onig_node_new_str(foldp, foldp + fold_len); - if (IS_NULL(snode)) goto mem_err; - if (IS_NULL(onig_node_list_add(root, snode))) { - onig_node_free(snode); - goto mem_err; - } - } - else { - r = onig_node_str_cat(snode, foldp, foldp + fold_len); - if (r != 0) goto err; - } - - if (n != 0) NODE_STRING_SET_AMBIG(snode); - if (is_good != 0) NODE_STRING_SET_GOOD_AMBIG(snode); + one_len = enclen(enc, p); + if (n == 0) { + q = p + one_len; + r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */, 0); + if (r != 0) goto err; } else { - alt_num *= (n + 1); - if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break; - - if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } + if 
(in_look_behind != 0) { + q = p + one_len; + r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len); + if (r != 0) goto err; } - - r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node); - if (r < 0) goto mem_err; - if (r == 1) { - if (IS_NULL(root)) { - top_root = prev_node; + else { + get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len); + q = p + max_len; + if (one_len == max_len && min_len == max_len) { + r = unravel_cf_string_alt_or_cc_add(&list, n, items, max_len, enc, + reg->case_fold_flag, p, q); + if (r != 0) goto err; + sn = NULL_NODE; } else { - if (IS_NULL(onig_node_list_add(root, prev_node))) { - onig_node_free(prev_node); - goto mem_err; - } - } - - root = NODE_CAR(prev_node); - } - else { /* r == 0 */ - if (IS_NOT_NULL(root)) { - if (IS_NULL(onig_node_list_add(root, prev_node))) { - onig_node_free(prev_node); - goto mem_err; - } + r = unravel_cf_string_fold_add(&list, &sn, enc, reg->case_fold_flag, + p, q); + if (r != 0) goto err; } } - - snode = NULL_NODE; } - p += len; + p = q; } - if (p < end) { - Node *srem; - - r = expand_case_fold_make_rem_string(&srem, p, end, reg); - if (r != 0) goto mem_err; - - if (IS_NOT_NULL(prev_node) && IS_NULL(root)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(srem); - onig_node_free(prev_node); - goto mem_err; - } - } - - if (IS_NULL(root)) { - prev_node = srem; + if (IS_NOT_NULL(list)) { + if (node_list_len(list) == 1) { + node_swap(node, NODE_CAR(list)); } else { - if (IS_NULL(onig_node_list_add(root, srem))) { - onig_node_free(srem); - goto mem_err; - } + node_swap(node, list); } + onig_node_free(list); + } + else { + node_swap(node, sn); + onig_node_free(sn); } - - /* ending */ - top_root = (IS_NOT_NULL(top_root) ? 
top_root : prev_node); - swap_node(node, top_root); - onig_node_free(top_root); return 0; - mem_err: - r = ONIGERR_MEMORY; - err: - onig_node_free(top_root); + if (IS_NOT_NULL(list)) + onig_node_free(list); + else if (IS_NOT_NULL(sn)) + onig_node_free(sn); + return r; } @@ -4121,7 +4515,7 @@ quantifiers_memory_node_info(Node* node) __inline #endif static int -setup_call_node_call(CallNode* cn, ScanEnv* env, int state) +tune_call_node_call(CallNode* cn, ScanEnv* env, int state) { MemEnv* mem_env = SCANENV_MEMENV(env); @@ -4141,7 +4535,7 @@ setup_call_node_call(CallNode* cn, ScanEnv* env, int state) } set_call_attr: - NODE_CALL_BODY(cn) = mem_env[cn->group_num].node; + NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node; if (IS_NULL(NODE_CALL_BODY(cn))) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); @@ -4172,23 +4566,23 @@ setup_call_node_call(CallNode* cn, ScanEnv* env, int state) } static void -setup_call2_call(Node* node) +tune_call2_call(Node* node) { switch (NODE_TYPE(node)) { case NODE_LIST: case NODE_ALT: do { - setup_call2_call(NODE_CAR(node)); + tune_call2_call(NODE_CAR(node)); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); break; case NODE_BAG: @@ -4198,19 +4592,19 @@ setup_call2_call(Node* node) if (en->type == BAG_MEMORY) { if (! 
NODE_IS_MARK1(node)) { NODE_STATUS_ADD(node, MARK1); - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); NODE_STATUS_REMOVE(node, MARK1); } } else if (en->type == BAG_IF_ELSE) { - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); if (IS_NOT_NULL(en->te.Then)) - setup_call2_call(en->te.Then); + tune_call2_call(en->te.Then); if (IS_NOT_NULL(en->te.Else)) - setup_call2_call(en->te.Else); + tune_call2_call(en->te.Else); } else { - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); } } break; @@ -4226,7 +4620,7 @@ setup_call2_call(Node* node) NODE_STATUS_ADD(called, CALLED); BAG_(called)->m.entry_count++; - setup_call2_call(called); + tune_call2_call(called); } NODE_STATUS_REMOVE(node, MARK1); } @@ -4238,7 +4632,7 @@ setup_call2_call(Node* node) } static int -setup_call(Node* node, ScanEnv* env, int state) +tune_call(Node* node, ScanEnv* env, int state) { int r; @@ -4246,7 +4640,7 @@ setup_call(Node* node, ScanEnv* env, int state) case NODE_LIST: case NODE_ALT: do { - r = setup_call(NODE_CAR(node), env, state); + r = tune_call(NODE_CAR(node), env, state); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -4254,12 +4648,12 @@ setup_call(Node* node, ScanEnv* env, int state) if (QUANT_(node)->upper == 0) state |= IN_ZERO_REPEAT; - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); else r = 0; break; @@ -4273,20 +4667,20 @@ setup_call(Node* node, ScanEnv* env, int state) NODE_STATUS_ADD(node, IN_ZERO_REPEAT); BAG_(node)->m.entry_count--; } - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); } else if (en->type == BAG_IF_ELSE) { - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); if (r != 0) return r; if 
(IS_NOT_NULL(en->te.Then)) { - r = setup_call(en->te.Then, env, state); + r = tune_call(en->te.Then, env, state); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_call(en->te.Else, env, state); + r = tune_call(en->te.Else, env, state); } else - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); } break; @@ -4296,7 +4690,7 @@ setup_call(Node* node, ScanEnv* env, int state) CALL_(node)->entry_count--; } - r = setup_call_node_call(CALL_(node), env, state); + r = tune_call_node_call(CALL_(node), env, state); break; default: @@ -4308,7 +4702,7 @@ setup_call(Node* node, ScanEnv* env, int state) } static int -setup_call2(Node* node) +tune_call2(Node* node) { int r = 0; @@ -4316,23 +4710,23 @@ setup_call2(Node* node) case NODE_LIST: case NODE_ALT: do { - r = setup_call2(NODE_CAR(node)); + r = tune_call2(NODE_CAR(node)); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: if (QUANT_(node)->upper != 0) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); break; case NODE_BAG: if (! NODE_IS_IN_ZERO_REPEAT(node)) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); { BagNode* en = BAG_(node); @@ -4340,18 +4734,18 @@ setup_call2(Node* node) if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = setup_call2(en->te.Then); + r = tune_call2(en->te.Then); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_call2(en->te.Else); + r = tune_call2(en->te.Else); } } break; case NODE_CALL: if (! 
NODE_IS_IN_ZERO_REPEAT(node)) { - setup_call2_call(node); + tune_call2_call(node); } break; @@ -4364,7 +4758,7 @@ setup_call2(Node* node) static void -setup_called_state_call(Node* node, int state) +tune_called_state_call(Node* node, int state) { switch (NODE_TYPE(node)) { case NODE_ALT: @@ -4372,7 +4766,7 @@ setup_called_state_call(Node* node, int state) /* fall */ case NODE_LIST: do { - setup_called_state_call(NODE_CAR(node), state); + tune_called_state_call(NODE_CAR(node), state); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -4385,7 +4779,7 @@ setup_called_state_call(Node* node, int state) if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - setup_called_state_call(NODE_QUANT_BODY(qn), state); + tune_called_state_call(NODE_QUANT_BODY(qn), state); } break; @@ -4400,7 +4794,7 @@ setup_called_state_call(Node* node, int state) /* fall */ case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: - setup_called_state_call(NODE_ANCHOR_BODY(an), state); + tune_called_state_call(NODE_ANCHOR_BODY(an), state); break; default: break; @@ -4416,31 +4810,33 @@ setup_called_state_call(Node* node, int state) if (NODE_IS_MARK1(node)) { if ((~en->m.called_state & state) != 0) { en->m.called_state |= state; - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); } } else { NODE_STATUS_ADD(node, MARK1); en->m.called_state |= state; - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); NODE_STATUS_REMOVE(node, MARK1); } } else if (en->type == BAG_IF_ELSE) { + state |= IN_ALT; + tune_called_state_call(NODE_BODY(node), state); if (IS_NOT_NULL(en->te.Then)) { - setup_called_state_call(en->te.Then, state); + tune_called_state_call(en->te.Then, state); } if (IS_NOT_NULL(en->te.Else)) - setup_called_state_call(en->te.Else, state); + tune_called_state_call(en->te.Else, state); } else { - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); } } break; case 
NODE_CALL: - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); break; default: @@ -4449,7 +4845,7 @@ setup_called_state_call(Node* node, int state) } static void -setup_called_state(Node* node, int state) +tune_called_state(Node* node, int state) { switch (NODE_TYPE(node)) { case NODE_ALT: @@ -4457,13 +4853,13 @@ setup_called_state(Node* node, int state) /* fall */ case NODE_LIST: do { - setup_called_state(NODE_CAR(node), state); + tune_called_state(NODE_CAR(node), state); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; #ifdef USE_CALL case NODE_CALL: - setup_called_state_call(node, state); + tune_called_state_call(node, state); break; #endif @@ -4480,14 +4876,15 @@ setup_called_state(Node* node, int state) /* fall */ case BAG_OPTION: case BAG_STOP_BACKTRACK: - setup_called_state(NODE_BODY(node), state); + tune_called_state(NODE_BODY(node), state); break; case BAG_IF_ELSE: - setup_called_state(NODE_BODY(node), state); + state |= IN_ALT; + tune_called_state(NODE_BODY(node), state); if (IS_NOT_NULL(en->te.Then)) - setup_called_state(en->te.Then, state); + tune_called_state(en->te.Then, state); if (IS_NOT_NULL(en->te.Else)) - setup_called_state(en->te.Else, state); + tune_called_state(en->te.Else, state); break; } } @@ -4502,7 +4899,7 @@ setup_called_state(Node* node, int state) if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - setup_called_state(NODE_QUANT_BODY(qn), state); + tune_called_state(NODE_QUANT_BODY(qn), state); } break; @@ -4517,7 +4914,7 @@ setup_called_state(Node* node, int state) /* fall */ case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: - setup_called_state(NODE_ANCHOR_BODY(an), state); + tune_called_state(NODE_ANCHOR_BODY(an), state); break; default: break; @@ -4538,13 +4935,13 @@ setup_called_state(Node* node, int state) #endif /* USE_CALL */ -static int setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env); +static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); #ifdef 
__GNUC__ __inline #endif static int -setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) { /* allowed node types in look-behind */ #define ALLOWED_TYPE_IN_LB \ @@ -4572,10 +4969,10 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) switch (an->type) { case ANCR_PREC_READ: - r = setup_tree(NODE_ANCHOR_BODY(an), reg, state, env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env); break; case ANCR_PREC_READ_NOT: - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); break; case ANCR_LOOK_BEHIND: @@ -4584,9 +4981,9 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_BAG_IN_LB, ALLOWED_ANCHOR_IN_LB); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env); if (r != 0) return r; - r = setup_look_behind(node, reg, env); + r = tune_look_behind(node, reg, env); } break; @@ -4596,10 +4993,10 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_BAG_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND), - env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND), + env); if (r != 0) return r; - r = setup_look_behind(node, reg, env); + r = tune_look_behind(node, reg, env); } break; @@ -4615,7 +5012,7 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) __inline #endif static int -setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) { int r; OnigLen d; @@ -4634,12 +5031,6 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (d == 0) { #ifdef 
USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT qn->emptiness = quantifiers_memory_node_info(body); - if (qn->emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) { - if (NODE_TYPE(body) == NODE_BAG && - BAG_(body)->type == BAG_MEMORY) { - MEM_STATUS_ON(env->bt_mem_end, BAG_(body)->m.regnum); - } - } #else qn->emptiness = BODY_IS_EMPTY_POSSIBILITY; #endif @@ -4651,7 +5042,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - r = setup_tree(body, reg, state, env); + r = tune_tree(body, reg, state, env); if (r != 0) return r; /* expand string */ @@ -4660,13 +5051,12 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper && qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { int len = NODE_STRING_LEN(body); - StrNode* sn = STR_(body); if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { int i, n = qn->lower; - onig_node_conv_to_str_node(node, STR_(body)->flag); + node_conv_to_str_node(node, STR_(body)->flag); for (i = 0; i < n; i++) { - r = onig_node_str_cat(node, sn->s, sn->end); + r = node_str_node_cat(node, body); if (r != 0) return r; } onig_node_free(body); @@ -4691,7 +5081,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) return r; } -/* setup_tree does the following work. +/* tune_tree does the following work. 1. check empty loop. (set qn->emptiness) 2. expand ignore-case in char class. 3. set memory status bit flags. (reg->mem_stats) @@ -4700,7 +5090,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) 6. expand repeated string. 
*/ static int -setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { int r = 0; @@ -4709,9 +5099,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { Node* prev = NULL_NODE; do { - r = setup_tree(NODE_CAR(node), reg, state, env); + r = tune_tree(NODE_CAR(node), reg, state, env); if (IS_NOT_NULL(prev) && r == 0) { - r = next_setup(prev, NODE_CAR(node), reg); + r = tune_next(prev, NODE_CAR(node), reg); } prev = NODE_CAR(node); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); @@ -4720,13 +5110,13 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) case NODE_ALT: do { - r = setup_tree(NODE_CAR(node), reg, (state | IN_ALT), env); + r = tune_tree(NODE_CAR(node), reg, (state | IN_ALT), env); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_STRING: - if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_RAW(node)) { - r = expand_case_fold_string(node, reg, state); + if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_CRUDE(node)) { + r = unravel_case_fold_string(node, reg, state); } break; @@ -4739,12 +5129,18 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) for (i = 0; i < br->back_num; i++) { if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; MEM_STATUS_ON(env->backrefed_mem, p[i]); - MEM_STATUS_ON(env->bt_mem_start, p[i]); +#if 0 #ifdef USE_BACKREF_WITH_LEVEL if (NODE_IS_NEST_LEVEL(node)) { - MEM_STATUS_ON(env->bt_mem_end, p[i]); + MEM_STATUS_ON(env->backtrack_mem, p[i]); } #endif +#else + /* More precisely, it should be checked whether alt/repeat exists before + the subject capture node, and then this backreference position + exists before (or in) the capture node. 
*/ + MEM_STATUS_ON(env->backtrack_mem, p[i]); +#endif } } break; @@ -4758,7 +5154,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { OnigOptionType options = reg->options; reg->options = BAG_(node)->o.options; - r = setup_tree(NODE_BODY(node), reg, state, env); + r = tune_tree(NODE_BODY(node), reg, state, env); reg->options = options; } break; @@ -4770,15 +5166,15 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0 || NODE_IS_RECURSION(node)) { - MEM_STATUS_ON(env->bt_mem_start, en->m.regnum); + MEM_STATUS_ON(env->backtrack_mem, en->m.regnum); } - r = setup_tree(NODE_BODY(node), reg, state, env); + r = tune_tree(NODE_BODY(node), reg, state, env); break; case BAG_STOP_BACKTRACK: { Node* target = NODE_BODY(node); - r = setup_tree(target, reg, state, env); + r = tune_tree(target, reg, state, env); if (NODE_TYPE(target) == NODE_QUANT) { QuantNode* tqn = QUANT_(target); if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 && @@ -4791,25 +5187,25 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case BAG_IF_ELSE: - r = setup_tree(NODE_BODY(node), reg, (state | IN_ALT), env); + r = tune_tree(NODE_BODY(node), reg, (state | IN_ALT), env); if (r != 0) return r; if (IS_NOT_NULL(en->te.Then)) { - r = setup_tree(en->te.Then, reg, (state | IN_ALT), env); + r = tune_tree(en->te.Then, reg, (state | IN_ALT), env); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_tree(en->te.Else, reg, (state | IN_ALT), env); + r = tune_tree(en->te.Else, reg, (state | IN_ALT), env); break; } } break; case NODE_QUANT: - r = setup_quant(node, reg, state, env); + r = tune_quant(node, reg, state, env); break; case NODE_ANCHOR: - r = setup_anchor(node, reg, state, env); + r = tune_anchor(node, reg, state, env); break; #ifdef USE_CALL @@ -4908,7 +5304,7 @@ typedef struct { } MinMax; typedef struct { - MinMax mmd; + MinMax mm; OnigEncoding enc; OnigOptionType options; 
OnigCaseFoldType case_fold_flag; @@ -4921,17 +5317,16 @@ typedef struct { } OptAnc; typedef struct { - MinMax mmd; /* position */ + MinMax mm; /* position */ OptAnc anc; int reach_end; int case_fold; - int good_case_fold; int len; UChar s[OPT_EXACT_MAXLEN]; } OptStr; typedef struct { - MinMax mmd; /* position */ + MinMax mm; /* position */ OptAnc anc; int value; /* weighted value */ UChar map[CHAR_MAP_SIZE]; @@ -5148,11 +5543,10 @@ is_full_opt_exact(OptStr* e) static void clear_opt_exact(OptStr* e) { - clear_mml(&e->mmd); + clear_mml(&e->mm); clear_opt_anc_info(&e->anc); e->reach_end = 0; e->case_fold = 0; - e->good_case_fold = 0; e->len = 0; e->s[0] = '\0'; } @@ -5176,11 +5570,6 @@ concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc) to->case_fold = 1; } - else { - if (to->good_case_fold != 0) { - if (add->good_case_fold == 0) return 0; - } - } } r = 0; @@ -5235,7 +5624,7 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) return ; } - if (! is_equal_mml(&to->mmd, &add->mmd)) { + if (! is_equal_mml(&to->mm, &add->mm)) { clear_opt_exact(to); return ; } @@ -5257,8 +5646,6 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) to->len = i; if (add->case_fold != 0) to->case_fold = 1; - if (add->good_case_fold == 0) - to->good_case_fold = 0; alt_merge_opt_anc_info(&to->anc, &add->anc); if (! 
to->reach_end) to->anc.right = 0; @@ -5291,10 +5678,7 @@ select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt) if (now->case_fold == 0) vn *= 2; if (alt->case_fold == 0) va *= 2; - if (now->good_case_fold != 0) vn *= 4; - if (alt->good_case_fold != 0) va *= 4; - - if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0) + if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0) copy_opt_exact(now, alt); } @@ -5378,7 +5762,7 @@ select_opt_map(OptMap* now, OptMap* alt) vn = z / now->value; va = z / alt->value; - if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0) + if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0) copy_opt_map(now, alt); } @@ -5392,17 +5776,14 @@ comp_opt_exact_or_map(OptStr* e, OptMap* m) if (m->value <= 0) return -1; if (e->case_fold != 0) { - if (e->good_case_fold != 0) - case_value = 2; - else - case_value = 1; + case_value = 1; } else case_value = 3; ae = COMP_EM_BASE * e->len * case_value; am = COMP_EM_BASE * 5 * 2 / m->value; - return comp_distance_value(&e->mmd, &m->mmd, ae, am); + return comp_distance_value(&e->mm, &m->mm, ae, am); } static void @@ -5410,14 +5791,14 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) { int i, val; - /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */ + /* if (! 
is_equal_mml(&to->mm, &add->mm)) return ; */ if (to->value == 0) return ; - if (add->value == 0 || to->mmd.max < add->mmd.min) { + if (add->value == 0 || to->mm.max < add->mm.min) { clear_opt_map(to); return ; } - alt_merge_mml(&to->mmd, &add->mmd); + alt_merge_mml(&to->mm, &add->mm); val = 0; for (i = 0; i < CHAR_MAP_SIZE; i++) { @@ -5435,9 +5816,9 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) static void set_bound_node_opt_info(OptNode* opt, MinMax* plen) { - copy_mml(&(opt->sb.mmd), plen); - copy_mml(&(opt->spr.mmd), plen); - copy_mml(&(opt->map.mmd), plen); + copy_mml(&(opt->sb.mm), plen); + copy_mml(&(opt->spr.mm), plen); + copy_mml(&(opt->map.mm), plen); } static void @@ -5472,7 +5853,7 @@ concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add) } if (add->map.value > 0 && to->len.max == 0) { - if (add->map.mmd.max == 0) + if (add->map.mm.max == 0) add->map.anc.left |= to->anc.left; } @@ -5497,10 +5878,7 @@ concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add) if (to->spr.len > 0) { if (add->len.max > 0) { - if (to->spr.len > (int )add->len.max) - to->spr.len = add->len.max; - - if (to->spr.mmd.max == 0) + if (to->spr.mm.max == 0) select_opt_exact(enc, &to->sb, &to->spr); else select_opt_exact(enc, &to->sm, &to->spr); @@ -5540,7 +5918,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) r = 0; enc = env->enc; clear_node_opt_info(opt); - set_bound_node_opt_info(opt, &env->mmd); + set_bound_node_opt_info(opt, &env->mm); switch (NODE_TYPE(node)) { case NODE_LIST: @@ -5552,7 +5930,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) do { r = optimize_nodes(NODE_CAR(nd), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mmd, &xo.len); + add_mml(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); } } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd))); @@ -5577,9 +5955,8 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) { StrNode* sn = STR_(node); int slen = (int )(sn->end - sn->s); - /* int is_raw 
= NODE_STRING_IS_RAW(node); */ - if (! NODE_STRING_IS_AMBIG(node)) { + if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) { concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); if (slen > 0) { add_char_opt_map(&opt->map, *(sn->s), enc); @@ -5587,28 +5964,20 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) set_mml(&opt->len, slen, slen); } else { - int max; + int max, min; - if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) { - int n = onigenc_strlen(enc, sn->s, sn->end); - max = ONIGENC_MBC_MAXLEN_DIST(enc) * n; - } - else { - concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); - opt->sb.case_fold = 1; - if (NODE_STRING_IS_GOOD_AMBIG(node)) - opt->sb.good_case_fold = 1; - - if (slen > 0) { - r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, - enc, env->case_fold_flag); - if (r != 0) break; - } + concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); + opt->sb.case_fold = 1; - max = slen; + if (slen > 0) { + r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, + enc, env->case_fold_flag); + if (r != 0) break; } - set_mml(&opt->len, slen, max); + max = slen; + min = sn->case_min_len * ONIGENC_MBC_MINLEN(enc); + set_mml(&opt->len, min, max); } } break; @@ -5618,7 +5987,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) int z; CClassNode* cc = CCLASS_(node); - /* no need to check ignore case. (set in setup_tree()) */ + /* no need to check ignore case. 
(set in tune_tree()) */ if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) { OnigLen min = ONIGENC_MBC_MINLEN(enc); @@ -5728,11 +6097,11 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) break; } backs = BACKREFS_P(br); - min = tree_min_len(mem_env[backs[0]].node, env->scan_env); - max = tree_max_len(mem_env[backs[0]].node, env->scan_env); + min = tree_min_len(mem_env[backs[0]].mem_node, env->scan_env); + max = tree_max_len(mem_env[backs[0]].mem_node, env->scan_env); for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].node, env->scan_env); - tmax = tree_max_len(mem_env[backs[i]].node, env->scan_env); + tmin = tree_min_len(mem_env[backs[i]].mem_node, env->scan_env); + tmax = tree_max_len(mem_env[backs[i]].mem_node, env->scan_env); if (min > tmin) min = tmin; if (max < tmax) max = tmax; } @@ -5782,7 +6151,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) } if (IS_INFINITE_REPEAT(qn->upper)) { - if (env->mmd.max == 0 && + if (env->mm.max == 0 && NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) { if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML); @@ -5850,7 +6219,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) copy_opt_env(&nenv, env); r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mmd, &xo.len); + add_mml(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); if (IS_NOT_NULL(en->te.Then)) { r = optimize_nodes(en->te.Then, &xo, &nenv); @@ -5899,15 +6268,6 @@ set_optimize_exact(regex_t* reg, OptStr* e) if (e->case_fold) { reg->optimize = OPTIMIZE_STR_CASE_FOLD; - if (e->good_case_fold != 0) { - if (e->len >= 2) { - r = set_sunday_quick_search_or_bmh_skip_table(reg, 1, - reg->exact, reg->exact_end, - reg->map, &(reg->map_offset)); - if (r != 0) return r; - reg->optimize = OPTIMIZE_STR_CASE_FOLD_FAST; - } - } } else { int allow_reverse; @@ -5930,11 +6290,17 @@ set_optimize_exact(regex_t* reg, OptStr* e) } } - reg->dmin = 
e->mmd.min; - reg->dmax = e->mmd.max; + reg->dist_min = e->mm.min; + reg->dist_max = e->mm.max; - if (reg->dmin != INFINITE_LEN) { - reg->threshold_len = reg->dmin + (int )(reg->exact_end - reg->exact); + if (reg->dist_min != INFINITE_LEN) { + int n; + if (e->case_fold != 0) + n = 1; + else + n = (int )(reg->exact_end - reg->exact); + + reg->threshold_len = reg->dist_min + n; } return 0; @@ -5949,11 +6315,11 @@ set_optimize_map(regex_t* reg, OptMap* m) reg->map[i] = m->map[i]; reg->optimize = OPTIMIZE_MAP; - reg->dmin = m->mmd.min; - reg->dmax = m->mmd.max; + reg->dist_min = m->mm.min; + reg->dist_max = m->mm.max; - if (reg->dmin != INFINITE_LEN) { - reg->threshold_len = reg->dmin + 1; + if (reg->dist_min != INFINITE_LEN) { + reg->threshold_len = reg->dist_min + 1; } } @@ -5979,7 +6345,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) env.options = reg->options; env.case_fold_flag = reg->case_fold_flag; env.scan_env = scan_env; - clear_mml(&env.mmd); + clear_mml(&env.mm); r = optimize_nodes(node, &opt, &env); if (r != 0) return r; @@ -5995,8 +6361,8 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) ANCR_PREC_READ_NOT); if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) { - reg->anchor_dmin = opt.len.min; - reg->anchor_dmax = opt.len.max; + reg->anc_dist_min = opt.len.min; + reg->anc_dist_max = opt.len.max; } if (opt.sb.len > 0 || opt.sm.len > 0) { @@ -6031,8 +6397,8 @@ clear_optimize_info(regex_t* reg) { reg->optimize = OPTIMIZE_NONE; reg->anchor = 0; - reg->anchor_dmin = 0; - reg->anchor_dmax = 0; + reg->anc_dist_min = 0; + reg->anc_dist_max = 0; reg->sub_anchor = 0; reg->exact_end = (UChar* )NULL; reg->map_offset = 0; @@ -6151,12 +6517,12 @@ print_optimize_info(FILE* f, regex_t* reg) { static const char* on[] = { "NONE", "STR", "STR_FAST", "STR_FAST_STEP_FORWARD", - "STR_CASE_FOLD_FAST", "STR_CASE_FOLD", "MAP" }; + "STR_CASE_FOLD", "MAP" }; fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " 
anchor: "); print_anchor(f, reg->anchor); if ((reg->anchor & ANCR_END_BUF_MASK) != 0) - print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax); + print_distance_range(f, reg->anc_dist_min, reg->anc_dist_max); fprintf(f, "\n"); if (reg->optimize) { @@ -6304,7 +6670,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, Node* root; ScanEnv scan_env; #ifdef USE_CALL - UnsetAddrList uslist; + UnsetAddrList uslist = {0}; #endif root = 0; @@ -6328,13 +6694,17 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->string_pool_end = 0; reg->num_mem = 0; reg->num_repeat = 0; - reg->num_null_check = 0; + reg->num_empty_check = 0; reg->repeat_range_alloc = 0; - reg->repeat_range = (OnigRepeatRange* )NULL; + reg->repeat_range = (RepeatRange* )NULL; + reg->empty_status_mem = 0; r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; + r = reduce_string_list(root); + if (r != 0) goto err; + /* mixed use named group and no-named group */ if (scan_env.num_named > 0 && IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && @@ -6355,38 +6725,65 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, r = unset_addr_list_init(&uslist, scan_env.num_call); if (r != 0) goto err; scan_env.unset_addr_list = &uslist; - r = setup_call(root, &scan_env, 0); + r = tune_call(root, &scan_env, 0); if (r != 0) goto err_unset; - r = setup_call2(root); + r = tune_call2(root); if (r != 0) goto err_unset; r = recursive_call_check_trav(root, &scan_env, 0); if (r < 0) goto err_unset; r = infinite_recursive_call_check_trav(root, &scan_env); if (r != 0) goto err_unset; - setup_called_state(root, 0); + tune_called_state(root, 0); } reg->num_call = scan_env.num_call; #endif - r = setup_tree(root, reg, 0, &scan_env); +#ifdef ONIG_DEBUG_PARSE + fprintf(stderr, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); + fprintf(stderr, "TREE (parsed)\n"); + print_tree(stderr, root); + 
fprintf(stderr, "\n"); +#endif + + r = tune_tree(root, reg, 0, &scan_env); if (r != 0) goto err_unset; + if (scan_env.backref_num != 0) { + set_parent_node_trav(root, NULL_NODE); + r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env); + if (r != 0) goto err_unset; + set_empty_status_check_trav(root, &scan_env); + } + #ifdef ONIG_DEBUG_PARSE + fprintf(stderr, "TREE (after tune)\n"); print_tree(stderr, root); + fprintf(stderr, "\n"); #endif - reg->capture_history = scan_env.capture_history; - reg->bt_mem_start = scan_env.bt_mem_start; - reg->bt_mem_start |= reg->capture_history; - if (IS_FIND_CONDITION(reg->options)) - MEM_STATUS_ON_ALL(reg->bt_mem_end); + reg->capture_history = scan_env.cap_history; + reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history; + +#ifdef USE_CALLOUT + if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) { + reg->push_mem_end = reg->push_mem_start; + } else { - reg->bt_mem_end = scan_env.bt_mem_end; - reg->bt_mem_end |= reg->capture_history; + if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start)) + reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history; + else + reg->push_mem_end = reg->push_mem_start & + (scan_env.backrefed_mem | scan_env.cap_history); } - reg->bt_mem_start |= reg->bt_mem_end; +#else + if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start)) + reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history; + else + reg->push_mem_end = reg->push_mem_start & + (scan_env.backrefed_mem | scan_env.cap_history); +#endif clear_optimize_info(reg); #ifndef ONIG_DONT_OPTIMIZE @@ -6420,14 +6817,20 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #endif - if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0) + set_addr_in_repeat_range(reg); + + if ((reg->push_mem_end != 0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + || (reg->num_repeat != 0) + || (reg->num_empty_check != 0) +#endif #ifdef USE_CALLOUT || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) #endif ) 
reg->stack_pop_level = STACK_POP_LEVEL_ALL; else { - if (reg->bt_mem_start != 0) + if (reg->push_mem_start != 0) reg->stack_pop_level = STACK_POP_LEVEL_MEM_START; else reg->stack_pop_level = STACK_POP_LEVEL_FREE; @@ -6560,11 +6963,14 @@ onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, if (IS_NULL(*reg)) return ONIGERR_MEMORY; r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); - if (r != 0) goto err; + if (r != 0) { + xfree(*reg); + *reg = NULL; + return r; + } r = onig_compile(*reg, pattern, pattern_end, einfo); if (r != 0) { - err: onig_free(*reg); *reg = NULL; } @@ -6709,12 +7115,14 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) #ifdef ONIG_DEBUG_PARSE +#ifdef USE_CALL static void p_string(FILE* f, int len, UChar* s) { fputs(":", f); while (len-- > 0) { fputc(*s++, f); } } +#endif static void Indent(FILE* f, int indent) @@ -6734,7 +7142,7 @@ print_indent_tree(FILE* f, Node* node, int indent) Indent(f, indent); if (IS_NULL(node)) { fprintf(f, "ERROR: null node!!!\n"); - exit (0); + exit(0); } type = NODE_TYPE(node); @@ -6758,28 +7166,22 @@ print_indent_tree(FILE* f, Node* node, int indent) case NODE_STRING: { + char* str; char* mode; - char* dont; - char* good; - if (NODE_STRING_IS_RAW(node)) - mode = "-raw"; - else if (NODE_STRING_IS_AMBIG(node)) - mode = "-ambig"; + if (NODE_STRING_IS_CRUDE(node)) + mode = "-crude"; + else if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) + mode = "-case_fold_match"; else mode = ""; - if (NODE_STRING_IS_GOOD_AMBIG(node)) - good = "-good"; + if (STR_(node)->s == STR_(node)->end) + str = "empty-string"; else - good = ""; + str = "string"; - if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) - dont = " (dont-opt)"; - else - dont = ""; - - fprintf(f, "<string%s%s%s:%p>", mode, good, dont, node); + fprintf(f, "<%s%s:%p>", str, mode, node); for (p = STR_(node)->s; p < STR_(node)->end; p++) { if (*p >= 0x20 && *p < 0x7f) fputc(*p, f); @@ -6901,6 +7303,34 @@ 
print_indent_tree(FILE* f, Node* node, int indent) case NODE_BAG: fprintf(f, "<bag:%p> ", node); + if (BAG_(node)->type == BAG_IF_ELSE) { + Node* Then; + Node* Else; + BagNode* bn; + + bn = BAG_(node); + fprintf(f, "if-else\n"); + print_indent_tree(f, NODE_BODY(node), indent + add); + + Then = bn->te.Then; + Else = bn->te.Else; + if (IS_NULL(Then)) { + Indent(f, indent + add); + fprintf(f, "THEN empty\n"); + } + else + print_indent_tree(f, Then, indent + add); + + if (IS_NULL(Else)) { + Indent(f, indent + add); + fprintf(f, "ELSE empty\n"); + } + else + print_indent_tree(f, Else, indent + add); + + break; + } + switch (BAG_(node)->type) { case BAG_OPTION: fprintf(f, "option:%d", BAG_(node)->o.options); @@ -6911,8 +7341,7 @@ print_indent_tree(FILE* f, Node* node, int indent) case BAG_STOP_BACKTRACK: fprintf(f, "stop-bt"); break; - case BAG_IF_ELSE: - fprintf(f, "if-else"); + default: break; } fprintf(f, "\n"); diff --git a/src/regenc.c b/src/regenc.c index 9fab721..16ac313 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -2,7 +2,7 @@ regenc.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -182,7 +182,8 @@ onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc, p += enclen(enc, p); } else { - if (prev) *prev = (const UChar* )NULL; /* Sorry */ + if (prev) + *prev = onigenc_get_prev_char_head(enc, start, p); } return p; } @@ -208,20 +209,6 @@ onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n) return (UChar* )s; } -#if 0 -extern int -onigenc_mbc_enc_len_end(OnigEncoding enc, const UChar* p, const UChar* end) -{ - int len; - int n; - - len = ONIGENC_MBC_ENC_LEN(enc, p); - n = (int )(end - p); - - return (n < len ? 
n : len); -} -#endif - extern UChar* onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n) { @@ -705,18 +692,6 @@ onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p, return 1; /* return byte length of converted char to lower */ } -#if 0 -extern int -onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp)++; - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); -} -#endif - extern int onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED) { @@ -833,41 +808,6 @@ onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, } } -#if 0 -extern int -onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - if (ONIGENC_IS_MBC_ASCII(p)) { - (*pp)++; - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); - } - - (*pp) += enclen(enc, p); - return FALSE; -} -#endif - -extern int -onigenc_mb2_code_to_mbclen(OnigCodePoint code) -{ - if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; - - if ((code & 0xff00) != 0) return 2; - else return 1; -} - -extern int -onigenc_mb4_code_to_mbclen(OnigCodePoint code) -{ - if ((code & 0xff000000) != 0) return 4; - else if ((code & 0xff0000) != 0) return 3; - else if ((code & 0xff00) != 0) return 2; - else return 1; -} - extern int onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) { diff --git a/src/regenc.h b/src/regenc.h index bd2819e..db35841 100644 --- a/src/regenc.h +++ b/src/regenc.h @@ -4,7 +4,7 @@ regenc.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -163,13 +163,11 @@ extern int onigenc_length_check_is_valid_mbc_string P_((OnigEncoding enc, const /* methods for multi byte encoding */ extern OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end)); extern int onigenc_mbn_mbc_case_fold P_((OnigEncoding enc, OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower)); -extern int onigenc_mb2_code_to_mbclen P_((OnigCodePoint code)); extern int onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); extern int onigenc_minimum_property_name_to_ctype P_((OnigEncoding enc, UChar* p, UChar* end)); extern int onigenc_unicode_property_name_to_ctype P_((OnigEncoding enc, UChar* p, UChar* end)); extern int onigenc_is_mbc_word_ascii P_((OnigEncoding enc, UChar* s, const UChar* end)); extern int onigenc_mb2_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); -extern int onigenc_mb4_code_to_mbclen P_((OnigCodePoint code)); extern int onigenc_mb4_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); extern int onigenc_mb4_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); extern struct PropertyNameCtype* onigenc_euc_jp_lookup_property_name P_((register const char *str, register size_t len)); diff --git a/src/regerror.c b/src/regerror.c index e6d1806..b57a276 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -2,7 +2,7 @@ regerror.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/regexec.c b/src/regexec.c index f957b75..ce498c6 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -2,7 +2,7 @@ regexec.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -39,6 +39,20 @@ #define CHECK_INTERRUPT_IN_MATCH +#define STACK_MEM_START(reg, i) \ + (MEM_STATUS_AT((reg)->push_mem_start, (i)) != 0 ? \ + STACK_AT(mem_start_stk[i])->u.mem.pstr : (UChar* )((void* )(mem_start_stk[i]))) + +#define STACK_MEM_END(reg, i) \ + (MEM_STATUS_AT((reg)->push_mem_end, (i)) != 0 ? \ + STACK_AT(mem_end_stk[i])->u.mem.pstr : (UChar* )((void* )(mem_end_stk[i]))) + +static int forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, UChar* range, UChar** low, UChar** high, UChar** low_prev); + +static int +search_in_range(regex_t* reg, const UChar* str, const UChar* end, const UChar* start, const UChar* range, /* match range */ const UChar* data_range, /* subject string range */ OnigRegion* region, OnigOptionType option, OnigMatchParam* mp); + + #ifdef USE_CALLOUT typedef struct { int last_match_at_call_counter; @@ -129,7 +143,7 @@ typedef struct { } MatchArg; -#ifdef ONIG_DEBUG +#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) /* arguments type */ typedef enum { @@ -149,102 +163,108 @@ typedef struct { } OpInfoType; static OpInfoType OpInfo[] = { - { OP_FINISH, "finish" }, - { OP_END, "end" }, - { OP_EXACT1, "exact1" }, - { OP_EXACT2, "exact2" }, - { OP_EXACT3, "exact3" }, - { OP_EXACT4, "exact4" }, - { OP_EXACT5, "exact5" }, - { OP_EXACTN, "exactn" }, - { OP_EXACTMB2N1, "exactmb2-n1" }, - { OP_EXACTMB2N2, "exactmb2-n2" }, - { OP_EXACTMB2N3, "exactmb2-n3" }, - { OP_EXACTMB2N, "exactmb2-n" }, 
- { OP_EXACTMB3N, "exactmb3n" }, - { OP_EXACTMBN, "exactmbn" }, - { OP_EXACT1_IC, "exact1-ic" }, - { OP_EXACTN_IC, "exactn-ic" }, - { OP_CCLASS, "cclass" }, - { OP_CCLASS_MB, "cclass-mb" }, - { OP_CCLASS_MIX, "cclass-mix" }, - { OP_CCLASS_NOT, "cclass-not" }, - { OP_CCLASS_MB_NOT, "cclass-mb-not" }, - { OP_CCLASS_MIX_NOT, "cclass-mix-not" }, - { OP_ANYCHAR, "anychar" }, - { OP_ANYCHAR_ML, "anychar-ml" }, - { OP_ANYCHAR_STAR, "anychar*" }, - { OP_ANYCHAR_ML_STAR, "anychar-ml*" }, - { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next" }, - { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next" }, - { OP_WORD, "word" }, - { OP_WORD_ASCII, "word-ascii" }, - { OP_NO_WORD, "not-word" }, - { OP_NO_WORD_ASCII, "not-word-ascii" }, - { OP_WORD_BOUNDARY, "word-boundary" }, - { OP_NO_WORD_BOUNDARY, "not-word-boundary" }, - { OP_WORD_BEGIN, "word-begin" }, - { OP_WORD_END, "word-end" }, - { OP_TEXT_SEGMENT_BOUNDARY, "text-segment-boundary" }, - { OP_BEGIN_BUF, "begin-buf" }, - { OP_END_BUF, "end-buf" }, - { OP_BEGIN_LINE, "begin-line" }, - { OP_END_LINE, "end-line" }, - { OP_SEMI_END_BUF, "semi-end-buf" }, - { OP_BEGIN_POSITION, "begin-position" }, - { OP_BACKREF1, "backref1" }, - { OP_BACKREF2, "backref2" }, - { OP_BACKREF_N, "backref-n" }, - { OP_BACKREF_N_IC, "backref-n-ic" }, - { OP_BACKREF_MULTI, "backref_multi" }, - { OP_BACKREF_MULTI_IC, "backref_multi-ic" }, - { OP_BACKREF_WITH_LEVEL, "backref_with_level" }, - { OP_BACKREF_WITH_LEVEL_IC, "backref_with_level-c" }, - { OP_BACKREF_CHECK, "backref_check" }, - { OP_BACKREF_CHECK_WITH_LEVEL, "backref_check_with_level" }, - { OP_MEMORY_START_PUSH, "mem-start-push" }, - { OP_MEMORY_START, "mem-start" }, - { OP_MEMORY_END_PUSH, "mem-end-push" }, - { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec" }, - { OP_MEMORY_END, "mem-end" }, - { OP_MEMORY_END_REC, "mem-end-rec" }, - { OP_FAIL, "fail" }, - { OP_JUMP, "jump" }, - { OP_PUSH, "push" }, - { OP_PUSH_SUPER, "push-super" }, - { OP_POP_OUT, "pop-out" }, + { OP_FINISH, "finish"}, + { 
OP_END, "end"}, + { OP_STR_1, "str_1"}, + { OP_STR_2, "str_2"}, + { OP_STR_3, "str_3"}, + { OP_STR_4, "str_4"}, + { OP_STR_5, "str_5"}, + { OP_STR_N, "str_n"}, + { OP_STR_MB2N1, "str_mb2-n1"}, + { OP_STR_MB2N2, "str_mb2-n2"}, + { OP_STR_MB2N3, "str_mb2-n3"}, + { OP_STR_MB2N, "str_mb2-n"}, + { OP_STR_MB3N, "str_mb3n"}, + { OP_STR_MBN, "str_mbn"}, + { OP_STR_1_IC, "str_1-ic"}, + { OP_STR_N_IC, "str_n-ic"}, + { OP_CCLASS, "cclass"}, + { OP_CCLASS_MB, "cclass-mb"}, + { OP_CCLASS_MIX, "cclass-mix"}, + { OP_CCLASS_NOT, "cclass-not"}, + { OP_CCLASS_MB_NOT, "cclass-mb-not"}, + { OP_CCLASS_MIX_NOT, "cclass-mix-not"}, + { OP_ANYCHAR, "anychar"}, + { OP_ANYCHAR_ML, "anychar-ml"}, + { OP_ANYCHAR_STAR, "anychar*"}, + { OP_ANYCHAR_ML_STAR, "anychar-ml*"}, + { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next"}, + { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next"}, + { OP_WORD, "word"}, + { OP_WORD_ASCII, "word-ascii"}, + { OP_NO_WORD, "not-word"}, + { OP_NO_WORD_ASCII, "not-word-ascii"}, + { OP_WORD_BOUNDARY, "word-boundary"}, + { OP_NO_WORD_BOUNDARY, "not-word-boundary"}, + { OP_WORD_BEGIN, "word-begin"}, + { OP_WORD_END, "word-end"}, + { OP_TEXT_SEGMENT_BOUNDARY, "text-segment-boundary"}, + { OP_BEGIN_BUF, "begin-buf"}, + { OP_END_BUF, "end-buf"}, + { OP_BEGIN_LINE, "begin-line"}, + { OP_END_LINE, "end-line"}, + { OP_SEMI_END_BUF, "semi-end-buf"}, + { OP_BEGIN_POSITION, "begin-position"}, + { OP_BACKREF1, "backref1"}, + { OP_BACKREF2, "backref2"}, + { OP_BACKREF_N, "backref-n"}, + { OP_BACKREF_N_IC, "backref-n-ic"}, + { OP_BACKREF_MULTI, "backref_multi"}, + { OP_BACKREF_MULTI_IC, "backref_multi-ic"}, + { OP_BACKREF_WITH_LEVEL, "backref_with_level"}, + { OP_BACKREF_WITH_LEVEL_IC, "backref_with_level-c"}, + { OP_BACKREF_CHECK, "backref_check"}, + { OP_BACKREF_CHECK_WITH_LEVEL, "backref_check_with_level"}, + { OP_MEM_START_PUSH, "mem-start-push"}, + { OP_MEM_START, "mem-start"}, + { OP_MEM_END_PUSH, "mem-end-push"}, +#ifdef USE_CALL + { OP_MEM_END_PUSH_REC, 
"mem-end-push-rec"}, +#endif + { OP_MEM_END, "mem-end"}, +#ifdef USE_CALL + { OP_MEM_END_REC, "mem-end-rec"}, +#endif + { OP_FAIL, "fail"}, + { OP_JUMP, "jump"}, + { OP_PUSH, "push"}, + { OP_PUSH_SUPER, "push-super"}, + { OP_POP_OUT, "pop-out"}, #ifdef USE_OP_PUSH_OR_JUMP_EXACT - { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1" }, + { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1"}, +#endif + { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next"}, + { OP_REPEAT, "repeat"}, + { OP_REPEAT_NG, "repeat-ng"}, + { OP_REPEAT_INC, "repeat-inc"}, + { OP_REPEAT_INC_NG, "repeat-inc-ng"}, + { OP_EMPTY_CHECK_START, "empty-check-start"}, + { OP_EMPTY_CHECK_END, "empty-check-end"}, + { OP_EMPTY_CHECK_END_MEMST, "empty-check-end-memst"}, +#ifdef USE_CALL + { OP_EMPTY_CHECK_END_MEMST_PUSH,"empty-check-end-memst-push"}, +#endif + { OP_PREC_READ_START, "push-pos"}, + { OP_PREC_READ_END, "pop-pos"}, + { OP_PREC_READ_NOT_START, "prec-read-not-start"}, + { OP_PREC_READ_NOT_END, "prec-read-not-end"}, + { OP_ATOMIC_START, "atomic-start"}, + { OP_ATOMIC_END, "atomic-end"}, + { OP_LOOK_BEHIND, "look-behind"}, + { OP_LOOK_BEHIND_NOT_START, "look-behind-not-start"}, + { OP_LOOK_BEHIND_NOT_END, "look-behind-not-end"}, + { OP_PUSH_SAVE_VAL, "push-save-val"}, + { OP_UPDATE_VAR, "update-var"}, +#ifdef USE_CALL + { OP_CALL, "call"}, + { OP_RETURN, "return"}, #endif - { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next" }, - { OP_REPEAT, "repeat" }, - { OP_REPEAT_NG, "repeat-ng" }, - { OP_REPEAT_INC, "repeat-inc" }, - { OP_REPEAT_INC_NG, "repeat-inc-ng" }, - { OP_REPEAT_INC_SG, "repeat-inc-sg" }, - { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg" }, - { OP_EMPTY_CHECK_START, "empty-check-start" }, - { OP_EMPTY_CHECK_END, "empty-check-end" }, - { OP_EMPTY_CHECK_END_MEMST, "empty-check-end-memst" }, - { OP_EMPTY_CHECK_END_MEMST_PUSH,"empty-check-end-memst-push" }, - { OP_PREC_READ_START, "push-pos" }, - { OP_PREC_READ_END, "pop-pos" }, - { OP_PREC_READ_NOT_START, "prec-read-not-start" }, - { OP_PREC_READ_NOT_END, "prec-read-not-end" }, 
- { OP_ATOMIC_START, "atomic-start" }, - { OP_ATOMIC_END, "atomic-end" }, - { OP_LOOK_BEHIND, "look-behind" }, - { OP_LOOK_BEHIND_NOT_START, "look-behind-not-start" }, - { OP_LOOK_BEHIND_NOT_END, "look-behind-not-end" }, - { OP_CALL, "call" }, - { OP_RETURN, "return" }, - { OP_PUSH_SAVE_VAL, "push-save-val" }, - { OP_UPDATE_VAR, "update-var" }, #ifdef USE_CALLOUT - { OP_CALLOUT_CONTENTS, "callout-contents" }, - { OP_CALLOUT_NAME, "callout-name" }, + { OP_CALLOUT_CONTENTS, "callout-contents"}, + { OP_CALLOUT_NAME, "callout-name"}, #endif - { -1, "" } + { -1, ""} }; static char* @@ -320,32 +340,32 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, fprintf(f, "%s", op2name(opcode)); switch (opcode) { - case OP_EXACT1: + case OP_STR_1: p_string(f, 1, p->exact.s); break; - case OP_EXACT2: + case OP_STR_2: p_string(f, 2, p->exact.s); break; - case OP_EXACT3: + case OP_STR_3: p_string(f, 3, p->exact.s); break; - case OP_EXACT4: + case OP_STR_4: p_string(f, 4, p->exact.s); break; - case OP_EXACT5: + case OP_STR_5: p_string(f, 5, p->exact.s); break; - case OP_EXACTN: + case OP_STR_N: len = p->exact_n.n; p_string(f, len, p->exact_n.s); break; - case OP_EXACTMB2N1: + case OP_STR_MB2N1: p_string(f, 2, p->exact.s); break; - case OP_EXACTMB2N2: + case OP_STR_MB2N2: p_string(f, 4, p->exact.s); break; - case OP_EXACTMB2N3: + case OP_STR_MB2N3: p_string(f, 3, p->exact.s); break; - case OP_EXACTMB2N: + case OP_STR_MB2N: len = p->exact_n.n; p_len_string(f, len, 2, p->exact_n.s); break; - case OP_EXACTMB3N: + case OP_STR_MB3N: len = p->exact_n.n; p_len_string(f, len, 3, p->exact_n.s); break; - case OP_EXACTMBN: + case OP_STR_MBN: { int mb_len; @@ -357,11 +377,11 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, while (n-- > 0) { fputc(*q++, f); } } break; - case OP_EXACT1_IC: + case OP_STR_1_IC: len = enclen(enc, p->exact.s); p_string(f, len, p->exact.s); break; - case OP_EXACTN_IC: + case OP_STR_N_IC: len = p->exact_n.n; p_len_string(f, len, 1, p->exact_n.s); 
break; @@ -375,13 +395,13 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_CCLASS_MB_NOT: { OnigCodePoint ncode; - OnigCodePoint* codes; + OnigCodePoint* codes; codes = (OnigCodePoint* )p->cclass_mb.mb; GET_CODE_POINT(ncode, codes); codes++; GET_CODE_POINT(code, codes); - fprintf(f, ":%u:%u", code, ncode); + fprintf(f, ":%d:0x%x", ncode, code); } break; case OP_CCLASS_MIX: @@ -447,15 +467,18 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, } break; - case OP_MEMORY_START: - case OP_MEMORY_START_PUSH: + case OP_MEM_START: + case OP_MEM_START_PUSH: mem = p->memory_start.num; fprintf(f, ":%d", mem); break; - case OP_MEMORY_END_PUSH: - case OP_MEMORY_END_PUSH_REC: - case OP_MEMORY_END: - case OP_MEMORY_END_REC: + + case OP_MEM_END: + case OP_MEM_END_PUSH: +#ifdef USE_CALL + case OP_MEM_END_REC: + case OP_MEM_END_PUSH_REC: +#endif mem = p->memory_end.num; fprintf(f, ":%d", mem); break; @@ -499,8 +522,6 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_REPEAT_INC: case OP_REPEAT_INC_NG: - case OP_REPEAT_INC_SG: - case OP_REPEAT_INC_NG_SG: mem = p->repeat.id; fprintf(f, ":%d", mem); break; @@ -511,7 +532,9 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, break; case OP_EMPTY_CHECK_END: case OP_EMPTY_CHECK_END_MEMST: +#ifdef USE_CALL case OP_EMPTY_CHECK_END_MEMST_PUSH: +#endif mem = p->empty_check_end.mem; fprintf(f, ":%d", mem); break; @@ -534,10 +557,12 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, p_rel_addr(f, addr, p, start); break; +#ifdef USE_CALL case OP_CALL: addr = p->call.addr; fprintf(f, ":{/%d}", addr); break; +#endif case OP_PUSH_SAVE_VAL: { @@ -607,7 +632,9 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_ATOMIC_START: case OP_ATOMIC_END: case OP_LOOK_BEHIND_NOT_END: +#ifdef USE_CALL case OP_RETURN: +#endif break; default: @@ -615,7 +642,7 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, break; } } -#endif /* ONIG_DEBUG */ +#endif /* 
defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) */ #ifdef ONIG_DEBUG_COMPILE extern void @@ -625,8 +652,8 @@ onig_print_compiled_byte_code_list(FILE* f, regex_t* reg) Operation* start = reg->ops; Operation* end = reg->ops + reg->ops_used; - fprintf(f, "bt_mem_start: 0x%x, bt_mem_end: 0x%x\n", - reg->bt_mem_start, reg->bt_mem_end); + fprintf(f, "push_mem_start: 0x%x, push_mem_end: 0x%x\n", + reg->push_mem_start, reg->push_mem_end); fprintf(f, "code-length: %d\n", reg->ops_used); bp = start; @@ -943,7 +970,7 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) result = ONIGERR_INVALID_ARGUMENT;\ }\ best_len = result;\ - goto finish;\ + goto match_at_end;\ break;\ }\ } while(0) @@ -965,18 +992,26 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) /* handled by normal-POP */ #define STK_MEM_START 0x0010 #define STK_MEM_END 0x8030 -#define STK_REPEAT_INC 0x0050 +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR +#define STK_REPEAT_INC (0x0040 | STK_MASK_POP_HANDLED) +#else +#define STK_REPEAT_INC 0x0040 +#endif #ifdef USE_CALLOUT #define STK_CALLOUT 0x0070 #endif /* avoided by normal-POP */ #define STK_VOID 0x0000 /* for fill a blank */ +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR +#define STK_EMPTY_CHECK_START (0x3000 | STK_MASK_POP_HANDLED) +#else #define STK_EMPTY_CHECK_START 0x3000 +#endif #define STK_EMPTY_CHECK_END 0x5000 /* for recursive call */ #define STK_MEM_END_MARK 0x8100 #define STK_TO_VOID_START 0x1200 /* mark for "(?>...)" */ -#define STK_REPEAT 0x0300 +/* #define STK_REPEAT 0x0300 */ #define STK_CALL_FRAME 0x0400 #define STK_RETURN 0x0500 #define STK_SAVE_VAL 0x0600 @@ -1002,11 +1037,10 @@ typedef struct _StackType { UChar* pstr_prev; /* previous char position of pstr */ } state; struct { - int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */ - Operation* pcode; /* byte code position (head of repeated target) */ - } repeat; - struct { - StackIndex si; /* index of stack */ + int count; +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex 
prev_index; /* index of stack */ +#endif } repeat_inc; struct { UChar *pstr; /* start/end position */ @@ -1015,7 +1049,10 @@ typedef struct _StackType { StackIndex prev_end; /* prev. info (for backtrack "(...)*" ) */ } mem; struct { - UChar *pstr; /* start position */ + UChar *pstr; /* start position */ +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex prev_index; /* index of stack */ +#endif } empty_check; #ifdef USE_CALL struct { @@ -1061,29 +1098,64 @@ struct OnigCalloutArgsStruct { #endif +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define PTR_NUM_SIZE(reg) ((reg)->num_repeat + (reg)->num_empty_check + ((reg)->num_mem + 1) * 2) +#define UPDATE_FOR_STACK_REALLOC do{\ + repeat_stk = (StackIndex* )alloc_base;\ + empty_check_stk = (StackIndex* )(repeat_stk + reg->num_repeat);\ + mem_start_stk = (StackIndex* )(empty_check_stk + reg->num_empty_check);\ + mem_end_stk = mem_start_stk + num_mem + 1;\ +} while(0) + +#define SAVE_REPEAT_STK_VAR(sid) stk->u.repeat_inc.prev_index = repeat_stk[sid] +#define LOAD_TO_REPEAT_STK_VAR(sid) repeat_stk[sid] = GET_STACK_INDEX(stk) +#define POP_REPEAT_INC else if (stk->type == STK_REPEAT_INC) {repeat_stk[stk->zid] = stk->u.repeat_inc.prev_index;} + +#define SAVE_EMPTY_CHECK_STK_VAR(sid) stk->u.empty_check.prev_index = empty_check_stk[sid] +#define LOAD_TO_EMPTY_CHECK_STK_VAR(sid) empty_check_stk[sid] = GET_STACK_INDEX(stk) +#define POP_EMPTY_CHECK_START else if (stk->type == STK_EMPTY_CHECK_START) {empty_check_stk[stk->zid] = stk->u.empty_check.prev_index;} + +#else + +#define PTR_NUM_SIZE(reg) (((reg)->num_mem + 1) * 2) +#define UPDATE_FOR_STACK_REALLOC do{\ + mem_start_stk = (StackIndex* )alloc_base;\ + mem_end_stk = mem_start_stk + num_mem + 1;\ +} while(0) + +#define SAVE_REPEAT_STK_VAR(sid) +#define LOAD_TO_REPEAT_STK_VAR(sid) +#define POP_REPEAT_INC + +#define SAVE_EMPTY_CHECK_STK_VAR(sid) +#define LOAD_TO_EMPTY_CHECK_STK_VAR(sid) +#define POP_EMPTY_CHECK_START + +#endif /* USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR */ 
#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE -#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mp) do { \ +#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ - (msa).match_stack_limit = (mp)->match_stack_limit;\ - (msa).retry_limit_in_match = (mp)->retry_limit_in_match;\ - (msa).mp = mp;\ + (msa).match_stack_limit = (mpv)->match_stack_limit;\ + (msa).retry_limit_in_match = (mpv)->retry_limit_in_match;\ + (msa).mp = mpv;\ (msa).best_len = ONIG_MISMATCH;\ - (msa).ptr_num = (reg)->num_repeat + ((reg)->num_mem + 1) * 2; \ + (msa).ptr_num = PTR_NUM_SIZE(reg);\ } while(0) #else -#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mp) do { \ +#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ - (msa).match_stack_limit = (mp)->match_stack_limit;\ - (msa).retry_limit_in_match = (mp)->retry_limit_in_match;\ - (msa).mp = mp;\ - (msa).ptr_num = (reg)->num_repeat + ((reg)->num_mem + 1) * 2; \ + (msa).match_stack_limit = (mpv)->match_stack_limit;\ + (msa).retry_limit_in_match = (mpv)->retry_limit_in_match;\ + (msa).mp = mpv;\ + (msa).ptr_num = PTR_NUM_SIZE(reg);\ } while(0) #endif @@ -1138,12 +1210,6 @@ struct OnigCalloutArgsStruct { };\ } while(0) -#define UPDATE_FOR_STACK_REALLOC do{\ - repeat_stk = (StackIndex* )alloc_base;\ - mem_start_stk = (StackIndex* )(repeat_stk + reg->num_repeat);\ - mem_end_stk = mem_start_stk + num_mem + 1;\ -} while(0) - static unsigned int MatchStackLimit = DEFAULT_MATCH_STACK_LIMIT_SIZE; extern unsigned int @@ -1164,7 +1230,9 @@ onig_set_match_stack_limit_size(unsigned int size) static unsigned long RetryLimitInMatch = DEFAULT_RETRY_LIMIT_IN_MATCH; #define CHECK_RETRY_LIMIT_IN_MATCH do {\ - if (retry_in_match_counter++ > 
retry_limit_in_match) goto retry_limit_in_match_over;\ + if (retry_in_match_counter++ > retry_limit_in_match) {\ + MATCH_AT_ERROR_RETURN(ONIGERR_RETRY_LIMIT_IN_MATCH_OVER);\ + }\ } while (0) #else @@ -1554,19 +1622,23 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_PUSH_ALT_LOOK_BEHIND_NOT(pat,s,sprev) \ STACK_PUSH(STK_ALT_LOOK_BEHIND_NOT,pat,s,sprev) +#if 0 #define STACK_PUSH_REPEAT(sid, pat) do {\ STACK_ENSURE(1);\ stk->type = STK_REPEAT;\ stk->zid = (sid);\ - stk->u.repeat.pcode = (pat);\ - stk->u.repeat.count = 0;\ + stk->u.repeat.pcode = (pat);\ STACK_INC;\ } while(0) +#endif -#define STACK_PUSH_REPEAT_INC(sindex) do {\ +#define STACK_PUSH_REPEAT_INC(sid, ct) do {\ STACK_ENSURE(1);\ stk->type = STK_REPEAT_INC;\ - stk->u.repeat_inc.si = (sindex);\ + stk->zid = (sid);\ + stk->u.repeat_inc.count = (ct);\ + SAVE_REPEAT_STK_VAR(sid);\ + LOAD_TO_REPEAT_STK_VAR(sid);\ STACK_INC;\ } while(0) @@ -1639,6 +1711,8 @@ stack_double(int is_alloca, char** arg_alloc_base, stk->type = STK_EMPTY_CHECK_START;\ stk->zid = (cnum);\ stk->u.empty_check.pstr = (s);\ + SAVE_EMPTY_CHECK_STK_VAR(cnum);\ + LOAD_TO_EMPTY_CHECK_STK_VAR(cnum);\ STACK_INC;\ } while(0) @@ -1776,7 +1850,7 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_BASE_CHECK(p, at) \ if ((p) < stk_base) {\ fprintf(stderr, "at %s\n", at);\ - goto stack_error;\ + MATCH_AT_ERROR_RETURN(ONIGERR_STACK_BUG);\ } #else #define STACK_BASE_CHECK(p, at) @@ -1827,13 +1901,12 @@ stack_double(int is_alloca, char** arg_alloc_base, mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ - else if (stk->type == STK_REPEAT_INC) {\ - STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ - }\ else if (stk->type == STK_MEM_END) {\ mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ + POP_REPEAT_INC \ + POP_EMPTY_CHECK_START \ POP_CALLOUT_CASE\ }\ }\ @@ -1852,13 +1925,12 @@ stack_double(int is_alloca, char** 
arg_alloc_base, mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ - else if (stk->type == STK_REPEAT_INC) {\ - STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ - }\ else if (stk->type == STK_MEM_END) {\ mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ + POP_REPEAT_INC \ + POP_EMPTY_CHECK_START \ /* Don't call callout here because negation of total success by (?!..) (?<!..) */\ }\ }\ @@ -1910,26 +1982,47 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ } while(0) -#define STACK_EMPTY_CHECK(isnull,sid,s) do {\ - StackType* k = stk;\ + +#define EMPTY_CHECK_START_SEARCH(sid, k) do {\ + k = stk;\ while (1) {\ k--;\ - STACK_BASE_CHECK(k, "STACK_EMPTY_CHECK"); \ + STACK_BASE_CHECK(k, "EMPTY_CHECK_START_SEARCH"); \ if (k->type == STK_EMPTY_CHECK_START) {\ - if (k->zid == (sid)) {\ - (isnull) = (k->u.empty_check.pstr == (s));\ - break;\ - }\ + if (k->zid == (sid)) break;\ }\ }\ } while(0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define GET_EMPTY_CHECK_START(sid, k) do {\ + if (reg->num_call == 0) {\ + k = STACK_AT(empty_check_stk[sid]);\ + }\ + else {\ + EMPTY_CHECK_START_SEARCH(sid, k);\ + }\ +} while(0) +#else + +#define GET_EMPTY_CHECK_START(sid, k) EMPTY_CHECK_START_SEARCH(sid, k) + +#endif + + +#define STACK_EMPTY_CHECK(isnull, sid, s) do {\ + StackType* k;\ + GET_EMPTY_CHECK_START(sid, k);\ + (isnull) = (k->u.empty_check.pstr == (s));\ +} while(0) + #define STACK_MEM_START_GET_PREV_END_ADDR(k /* STK_MEM_START*/, reg, addr) do {\ if (k->u.mem.prev_end == INVALID_STACK_INDEX) {\ (addr) = 0;\ }\ else {\ - if (MEM_STATUS_AT((reg)->bt_mem_end, k->zid))\ + if (MEM_STATUS_AT((reg)->push_mem_end, k->zid))\ (addr) = STACK_AT(k->u.mem.prev_end)->u.mem.pstr;\ else\ (addr) = (UChar* )k->u.mem.prev_end;\ @@ -1937,45 +2030,30 @@ stack_double(int is_alloca, char** arg_alloc_base, } while (0) #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT -#define 
STACK_EMPTY_CHECK_MEM(isnull,sid,s,reg) do {\ - StackType* k = stk;\ - while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_EMPTY_CHECK_MEM"); \ - if (k->type == STK_EMPTY_CHECK_START) {\ - if (k->zid == (sid)) {\ - if (k->u.empty_check.pstr != (s)) {\ - (isnull) = 0;\ - break;\ +#define STACK_EMPTY_CHECK_MEM(isnull, sid, s, reg) do {\ + StackType* k;\ + GET_EMPTY_CHECK_START(sid, k);\ + if (k->u.empty_check.pstr != (s)) {\ + (isnull) = 0;\ + }\ + else {\ + UChar* endp;\ + (isnull) = 1;\ + while (k < stk) {\ + if (k->type == STK_MEM_START &&\ + MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid)) {\ + STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ + if (endp == 0) {\ + (isnull) = 0; break;\ }\ - else {\ - UChar* endp;\ - int level = 0;\ - (isnull) = 1;\ - while (k < stk) {\ - if (k->type == STK_MEM_START && level == 0) {\ - STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ - if (endp == 0) {\ - (isnull) = 0; break;\ - }\ - else if (STACK_AT(k->u.mem.prev_start)->u.mem.pstr != endp) {\ - (isnull) = 0; break;\ - }\ - else if (endp != s) {\ - (isnull) = -1; /* empty, but position changed */ \ - }\ - }\ - else if (k->type == STK_PREC_READ_START) {\ - level++;\ - }\ - else if (k->type == STK_PREC_READ_END) {\ - level--;\ - }\ - k++;\ - }\ - break;\ + else if (STACK_AT(k->u.mem.prev_start)->u.mem.pstr != endp) {\ + (isnull) = 0; break;\ + }\ + else if (endp != s) {\ + (isnull) = -1; /* empty, but position changed */ \ }\ }\ + k++;\ }\ }\ } while(0) @@ -1995,11 +2073,11 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ else {\ UChar* endp;\ - int prec_level = 0;\ (isnull) = 1;\ while (k < stk) {\ if (k->type == STK_MEM_START) {\ - if (level == 0 && prec_level == 0) {\ + if (level == 0 && \ + MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid) !=0) {\ STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ if (endp == 0) {\ (isnull) = 0; break;\ @@ -2018,12 +2096,6 @@ stack_double(int is_alloca, char** arg_alloc_base, else if (k->type == STK_EMPTY_CHECK_END) {\ if 
(k->zid == (sid)) level--;\ }\ - else if (k->type == STK_PREC_READ_START) {\ - prec_level++;\ - }\ - else if (k->type == STK_PREC_READ_END) {\ - prec_level--;\ - }\ k++;\ }\ break;\ @@ -2062,24 +2134,45 @@ stack_double(int is_alloca, char** arg_alloc_base, } while(0) #endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ -#define STACK_GET_REPEAT(sid, k) do {\ - int level = 0;\ - k = stk;\ +#define STACK_GET_REPEAT_COUNT_SEARCH(sid, c) do {\ + StackType* k = stk;\ while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_GET_REPEAT"); \ - if (k->type == STK_REPEAT) {\ - if (level == 0) {\ - if (k->zid == (sid)) {\ - break;\ + (k)--;\ + STACK_BASE_CHECK(k, "STACK_GET_REPEAT_COUNT_SEARCH");\ + if ((k)->type == STK_REPEAT_INC) {\ + if ((k)->zid == (sid)) {\ + (c) = (k)->u.repeat_inc.count;\ + break;\ + }\ + }\ + else if ((k)->type == STK_RETURN) {\ + int level = -1;\ + while (1) {\ + (k)--;\ + if ((k)->type == STK_CALL_FRAME) {\ + level++;\ + if (level == 0) break;\ }\ + else if ((k)->type == STK_RETURN) level--;\ }\ }\ - else if (k->type == STK_CALL_FRAME) level--;\ - else if (k->type == STK_RETURN) level++;\ }\ } while(0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define STACK_GET_REPEAT_COUNT(sid, c) do {\ + if (reg->num_call == 0) {\ + (c) = (STACK_AT(repeat_stk[sid]))->u.repeat_inc.count;\ + }\ + else {\ + STACK_GET_REPEAT_COUNT_SEARCH(sid, c);\ + }\ +} while(0) +#else +#define STACK_GET_REPEAT_COUNT(sid, c) STACK_GET_REPEAT_COUNT_SEARCH(sid, c) +#endif + #define STACK_RETURN(addr) do {\ int level = 0;\ StackType* k = stk;\ @@ -2481,6 +2574,8 @@ typedef struct { #define MATCH_DEBUG_OUT(offset) #endif +#define MATCH_AT_ERROR_RETURN(err_code) best_len = err_code; goto match_at_end + /* match data(str - end) from position (sstart). */ /* if sstart == str then set sprev to NULL. 
*/ @@ -2500,20 +2595,20 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, static const void *opcode_to_label[] = { &&L_FINISH, &&L_END, - &&L_EXACT1, - &&L_EXACT2, - &&L_EXACT3, - &&L_EXACT4, - &&L_EXACT5, - &&L_EXACTN, - &&L_EXACTMB2N1, - &&L_EXACTMB2N2, - &&L_EXACTMB2N3, - &&L_EXACTMB2N, - &&L_EXACTMB3N, - &&L_EXACTMBN, - &&L_EXACT1_IC, - &&L_EXACTN_IC, + &&L_STR_1, + &&L_STR_2, + &&L_STR_3, + &&L_STR_4, + &&L_STR_5, + &&L_STR_N, + &&L_STR_MB2N1, + &&L_STR_MB2N2, + &&L_STR_MB2N3, + &&L_STR_MB2N, + &&L_STR_MB3N, + &&L_STR_MBN, + &&L_STR_1_IC, + &&L_STR_N_IC, &&L_CCLASS, &&L_CCLASS_MB, &&L_CCLASS_MIX, @@ -2551,12 +2646,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_BACKREF_WITH_LEVEL_IC, &&L_BACKREF_CHECK, &&L_BACKREF_CHECK_WITH_LEVEL, - &&L_MEMORY_START, - &&L_MEMORY_START_PUSH, - &&L_MEMORY_END_PUSH, - &&L_MEMORY_END_PUSH_REC, - &&L_MEMORY_END, - &&L_MEMORY_END_REC, + &&L_MEM_START, + &&L_MEM_START_PUSH, + &&L_MEM_END_PUSH, +#ifdef USE_CALL + &&L_MEM_END_PUSH_REC, +#endif + &&L_MEM_END, +#ifdef USE_CALL + &&L_MEM_END_REC, +#endif &&L_FAIL, &&L_JUMP, &&L_PUSH, @@ -2570,12 +2669,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_REPEAT_NG, &&L_REPEAT_INC, &&L_REPEAT_INC_NG, - &&L_REPEAT_INC_SG, - &&L_REPEAT_INC_NG_SG, &&L_EMPTY_CHECK_START, &&L_EMPTY_CHECK_END, &&L_EMPTY_CHECK_END_MEMST, +#ifdef USE_CALL &&L_EMPTY_CHECK_END_MEMST_PUSH, +#endif &&L_PREC_READ_START, &&L_PREC_READ_END, &&L_PREC_READ_NOT_START, @@ -2585,10 +2684,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_LOOK_BEHIND, &&L_LOOK_BEHIND_NOT_START, &&L_LOOK_BEHIND_NOT_END, - &&L_CALL, - &&L_RETURN, &&L_PUSH_SAVE_VAL, &&L_UPDATE_VAR, +#ifdef USE_CALL + &&L_CALL, + &&L_RETURN, +#endif #ifdef USE_CALLOUT &&L_CALLOUT_CONTENTS, &&L_CALLOUT_NAME, @@ -2606,15 +2707,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, char *alloc_base; StackType *stk_base, *stk, *stk_end; StackType *stkp; /* used as any purpose. 
*/ - StackIndex si; - StackIndex *repeat_stk; StackIndex *mem_start_stk, *mem_end_stk; UChar* keep; + +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex *repeat_stk; + StackIndex *empty_check_stk; +#endif #ifdef USE_RETRY_LIMIT_IN_MATCH unsigned long retry_limit_in_match; unsigned long retry_in_match_counter; #endif - #ifdef USE_CALLOUT int of; #endif @@ -2700,15 +2803,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, rmt[0].rm_eo = (regoff_t )(s - str); for (i = 1; i <= num_mem; i++) { if (mem_end_stk[i] != INVALID_STACK_INDEX) { - if (MEM_STATUS_AT(reg->bt_mem_start, i)) - rmt[i].rm_so = (regoff_t )(STACK_AT(mem_start_stk[i])->u.mem.pstr - str); - else - rmt[i].rm_so = (regoff_t )((UChar* )((void* )(mem_start_stk[i])) - str); - - rmt[i].rm_eo = (regoff_t )((MEM_STATUS_AT(reg->bt_mem_end, i) - ? STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - - str); + rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str); + rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i) - str); } else { rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; @@ -2721,14 +2817,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, region->end[0] = (int )(s - str); for (i = 1; i <= num_mem; i++) { if (mem_end_stk[i] != INVALID_STACK_INDEX) { - if (MEM_STATUS_AT(reg->bt_mem_start, i)) - region->beg[i] = (int )(STACK_AT(mem_start_stk[i])->u.mem.pstr - str); - else - region->beg[i] = (int )((UChar* )((void* )mem_start_stk[i]) - str); - - region->end[i] = (int )((MEM_STATUS_AT(reg->bt_mem_end, i) - ? 
STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - str); + region->beg[i] = (int )(STACK_MEM_START(reg, i) - str); + region->end[i] = (int )(STACK_MEM_END(reg, i) - str); } else { region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; @@ -2756,10 +2846,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, stkp = stk_base; r = make_capture_history_tree(region->history_root, &stkp, stk, (UChar* )str, reg); - if (r < 0) { - best_len = r; /* error code */ - goto finish; - } + if (r < 0) MATCH_AT_ERROR_RETURN(r); } #endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_API_REGION_OPTION @@ -2784,9 +2871,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } /* default behavior: return first-matching result. */ - goto finish; + goto match_at_end; - CASE_OP(EXACT1) + CASE_OP(STR_1) DATA_ENSURE(1); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2794,7 +2881,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; NEXT_OUT; - CASE_OP(EXACT1_IC) + CASE_OP(STR_1_IC) { int len; UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; @@ -2815,7 +2902,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; NEXT_OUT; - CASE_OP(EXACT2) + CASE_OP(STR_2) DATA_ENSURE(2); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2826,7 +2913,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACT3) + CASE_OP(STR_3) DATA_ENSURE(3); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2839,7 +2926,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACT4) + CASE_OP(STR_4) DATA_ENSURE(4); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2854,7 +2941,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACT5) + CASE_OP(STR_5) DATA_ENSURE(5); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2871,7 +2958,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTN) + CASE_OP(STR_N) tlen = 
p->exact_n.n; DATA_ENSURE(tlen); ps = p->exact_n.s; @@ -2882,7 +2969,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTN_IC) + CASE_OP(STR_N_IC) { int len; UChar *q, *endp, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; @@ -2900,6 +2987,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, DATA_ENSURE(0); q = lowbuf; while (len-- > 0) { + if (ps >= endp) goto fail; if (*ps != *q) goto fail; ps++; q++; } @@ -2909,7 +2997,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB2N1) + CASE_OP(STR_MB2N1) DATA_ENSURE(2); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2919,7 +3007,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; NEXT_OUT; - CASE_OP(EXACTMB2N2) + CASE_OP(STR_MB2N2) DATA_ENSURE(4); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2934,7 +3022,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB2N3) + CASE_OP(STR_MB2N3) DATA_ENSURE(6); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2953,7 +3041,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB2N) + CASE_OP(STR_MB2N) tlen = p->exact_n.n; DATA_ENSURE(tlen * 2); ps = p->exact_n.s; @@ -2967,7 +3055,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB3N) + CASE_OP(STR_MB3N) tlen = p->exact_n.n; DATA_ENSURE(tlen * 3); ps = p->exact_n.s; @@ -2983,7 +3071,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMBN) + CASE_OP(STR_MBN) tlen = p->exact_len_n.len; /* mb byte len */ tlen2 = p->exact_len_n.n; /* number of chars */ tlen2 *= tlen; @@ -3014,7 +3102,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar *ss; int mb_len; - DATA_ENSURE(1); mb_len = enclen(encode, s); DATA_ENSURE(mb_len); ss = s; @@ -3303,7 +3390,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, break; #endif default: - goto 
bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNDEFINED_BYTECODE); break; } @@ -3403,46 +3490,50 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(MEMORY_START_PUSH) + CASE_OP(MEM_START_PUSH) mem = p->memory_start.num; STACK_PUSH_MEM_START(mem, s); INC_OP; JUMP_OUT; - CASE_OP(MEMORY_START) + CASE_OP(MEM_START) mem = p->memory_start.num; mem_start_stk[mem] = (StackIndex )((void* )s); INC_OP; JUMP_OUT; - CASE_OP(MEMORY_END_PUSH) + CASE_OP(MEM_END_PUSH) mem = p->memory_end.num; STACK_PUSH_MEM_END(mem, s); INC_OP; JUMP_OUT; - CASE_OP(MEMORY_END) + CASE_OP(MEM_END) mem = p->memory_end.num; mem_end_stk[mem] = (StackIndex )((void* )s); INC_OP; JUMP_OUT; #ifdef USE_CALL - CASE_OP(MEMORY_END_PUSH_REC) - mem = p->memory_end.num; - STACK_GET_MEM_START(mem, stkp); /* should be before push mem-end. */ - si = GET_STACK_INDEX(stkp); - STACK_PUSH_MEM_END(mem, s); - mem_start_stk[mem] = si; - INC_OP; - JUMP_OUT; + CASE_OP(MEM_END_PUSH_REC) + { + StackIndex si; + + mem = p->memory_end.num; + STACK_GET_MEM_START(mem, stkp); /* should be before push mem-end. */ + si = GET_STACK_INDEX(stkp); + STACK_PUSH_MEM_END(mem, s); + mem_start_stk[mem] = si; + INC_OP; + JUMP_OUT; + } - CASE_OP(MEMORY_END_REC) + CASE_OP(MEM_END_REC) mem = p->memory_end.num; mem_end_stk[mem] = (StackIndex )((void* )s); STACK_GET_MEM_START(mem, stkp); - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) + if (MEM_STATUS_AT(reg->push_mem_start, mem)) mem_start_stk[mem] = GET_STACK_INDEX(stkp); else mem_start_stk[mem] = (StackIndex )((void* )stkp->u.mem.pstr); @@ -3470,14 +3561,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); @@ -3499,14 +3584,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); @@ -3531,14 +3610,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); @@ -3569,14 +3642,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); @@ -3689,12 +3756,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH: case OP_REPEAT_INC: case OP_REPEAT_INC_NG: - case OP_REPEAT_INC_SG: - case OP_REPEAT_INC_NG_SG: INC_OP; break; default: - goto unexpected_bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNEXPECTED_BYTECODE); break; } #else @@ -3797,7 +3862,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, addr = p->push_if_peek_next.addr; c = p->push_if_peek_next.c; - if (c == *s) { + if (DATA_ENSURE_CHECK1 && c == *s) { STACK_PUSH_ALT(p + addr, s, sprev); INC_OP; JUMP_OUT; @@ -3810,10 +3875,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mem = p->repeat.id; /* mem: OP_REPEAT ID */ addr = p->repeat.addr; - STACK_ENSURE(1); - repeat_stk[mem] = GET_STACK_INDEX(stk); - STACK_PUSH_REPEAT(mem, p + 1); - + STACK_PUSH_REPEAT_INC(mem, 0); if (reg->repeat_range[mem].lower == 0) { STACK_PUSH_ALT(p + addr, s, sprev); } @@ -3824,10 +3886,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mem = p->repeat.id; /* mem: OP_REPEAT ID */ addr = p->repeat.addr; - STACK_ENSURE(1); - repeat_stk[mem] = GET_STACK_INDEX(stk); - STACK_PUSH_REPEAT(mem, p + 1); - + STACK_PUSH_REPEAT_INC(mem, 0); if (reg->repeat_range[mem].lower == 0) { STACK_PUSH_ALT(p + 1, s, sprev); p += addr; @@ -3838,64 +3897,42 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(REPEAT_INC) mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - si = repeat_stk[mem]; - stkp = STACK_AT(si); - - repeat_inc: - stkp->u.repeat.count++; - if (stkp->u.repeat.count >= reg->repeat_range[mem].upper) { + STACK_GET_REPEAT_COUNT(mem, n); + n++; + if (n >= reg->repeat_range[mem].upper) { /* end of repeat. Nothing to do. 
*/ INC_OP; } - else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + else if (n >= reg->repeat_range[mem].lower) { INC_OP; STACK_PUSH_ALT(p, s, sprev); - p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. */ + p = reg->repeat_range[mem].u.pcode; } else { - p = stkp->u.repeat.pcode; + p = reg->repeat_range[mem].u.pcode; } - STACK_PUSH_REPEAT_INC(si); + STACK_PUSH_REPEAT_INC(mem, n); CHECK_INTERRUPT_JUMP_OUT; - CASE_OP(REPEAT_INC_SG) - mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - goto repeat_inc; - CASE_OP(REPEAT_INC_NG) mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - si = repeat_stk[mem]; - stkp = STACK_AT(si); - - repeat_inc_ng: - stkp->u.repeat.count++; - if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { - if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - Operation* pcode = stkp->u.repeat.pcode; - - STACK_PUSH_REPEAT_INC(si); - STACK_PUSH_ALT(pcode, s, sprev); + STACK_GET_REPEAT_COUNT(mem, n); + n++; + STACK_PUSH_REPEAT_INC(mem, n); + if (n == reg->repeat_range[mem].upper) { + INC_OP; + } + else { + if (n >= reg->repeat_range[mem].lower) { + STACK_PUSH_ALT(reg->repeat_range[mem].u.pcode, s, sprev); INC_OP; } else { - p = stkp->u.repeat.pcode; - STACK_PUSH_REPEAT_INC(si); + p = reg->repeat_range[mem].u.pcode; } } - else if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { - STACK_PUSH_REPEAT_INC(si); - INC_OP; - } CHECK_INTERRUPT_JUMP_OUT; - CASE_OP(REPEAT_INC_NG_SG) - mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - goto repeat_inc_ng; - CASE_OP(PREC_READ_START) STACK_PUSH_PREC_READ_START(s, sprev); INC_OP; @@ -4044,14 +4081,14 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, OnigCalloutFunc func; OnigCalloutArgs args; - of = ONIG_CALLOUT_OF_NAME; - name_id = p->callout_name.id; - mem = p->callout_name.num; + of = ONIG_CALLOUT_OF_NAME; + mem = p->callout_name.num; 
callout_common_entry: e = onig_reg_callout_list_at(reg, mem); in = e->in; if (of == ONIG_CALLOUT_OF_NAME) { + name_id = p->callout_name.id; func = onig_get_callout_start_func(reg, mem); } else { @@ -4074,7 +4111,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, call_result = ONIGERR_INVALID_ARGUMENT; } best_len = call_result; - goto finish; + goto match_at_end; break; } } @@ -4100,7 +4137,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif CASE_OP(FINISH) - goto finish; + goto match_at_end; #ifdef ONIG_DEBUG_STATISTICS fail: @@ -4121,37 +4158,472 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, JUMP_OUT; DEFAULT_OP - goto bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNDEFINED_BYTECODE); } BYTECODE_INTERPRETER_END; - finish: + match_at_end: STACK_SAVE; return best_len; +} -#ifdef ONIG_DEBUG - stack_error: - STACK_SAVE; - return ONIGERR_STACK_BUG; -#endif +typedef struct { + regex_t* reg; + OnigRegion* region; +} RR; + +struct OnigRegSetStruct { + RR* rs; + int n; + int alloc; + OnigEncoding enc; + int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ + OnigLen anc_dmin; /* (SEMI_)END_BUF anchor distance */ + OnigLen anc_dmax; /* (SEMI_)END_BUF anchor distance */ + int all_low_high; + int anychar_inf; +}; - bytecode_error: - STACK_SAVE; - return ONIGERR_UNDEFINED_BYTECODE; +enum SearchRangeStatus { + SRS_DEAD = 0, + SRS_LOW_HIGH = 1, + SRS_ALL_RANGE = 2 +}; -#if defined(ONIG_DEBUG) && !defined(USE_DIRECT_THREADED_CODE) - unexpected_bytecode_error: - STACK_SAVE; - return ONIGERR_UNEXPECTED_BYTECODE; -#endif +typedef struct { + int state; /* value of enum SearchRangeStatus */ + UChar* low; + UChar* high; + UChar* low_prev; + UChar* sch_range; +} SearchRange; + +#define REGSET_MATCH_AND_RETURN_CHECK(upper_range) \ + r = match_at(reg, str, end, (upper_range), s, prev, msas + i); \ + if (r != ONIG_MISMATCH) {\ + if (r >= 0) {\ + goto match;\ + }\ + else goto finish; /* error */ \ + } -#ifdef USE_RETRY_LIMIT_IN_MATCH - 
retry_limit_in_match_over: - STACK_SAVE; - return ONIGERR_RETRY_LIMIT_IN_MATCH_OVER; +static inline int +regset_search_body_position_lead(OnigRegSet* set, + const UChar* str, const UChar* end, + const UChar* start, const UChar* range, /* match start range */ + const UChar* orig_range, /* data range */ + OnigOptionType option, MatchArg* msas, int* rmatch_pos) +{ + int r, n, i; + UChar *s, *prev; + UChar *low, *high, *low_prev; + UChar* sch_range; + regex_t* reg; + OnigEncoding enc; + SearchRange* sr; + + n = set->n; + enc = set->enc; + + s = (UChar* )start; + if (s > str) + prev = onigenc_get_prev_char_head(enc, str, s); + else + prev = (UChar* )NULL; + + sr = (SearchRange* )xmalloc(sizeof(*sr) * n); + CHECK_NULL_RETURN_MEMERR(sr); + + for (i = 0; i < n; i++) { + reg = set->rs[i].reg; + + sr[i].state = SRS_DEAD; + if (reg->optimize != OPTIMIZE_NONE) { + if (reg->dist_max != INFINITE_LEN) { + if (end - range > reg->dist_max) + sch_range = (UChar* )range + reg->dist_max; + else + sch_range = (UChar* )end; + + if (forward_search(reg, str, end, s, sch_range, &low, &high, &low_prev)) { + sr[i].state = SRS_LOW_HIGH; + sr[i].low = low; + sr[i].high = high; + sr[i].low_prev = low_prev; + sr[i].sch_range = sch_range; + } + } + else { + sch_range = (UChar* )end; + if (forward_search(reg, str, end, s, sch_range, + &low, &high, (UChar** )NULL)) { + goto total_active; + } + } + } + else { + total_active: + sr[i].state = SRS_ALL_RANGE; + sr[i].low = s; + sr[i].high = (UChar* )range; + sr[i].low_prev = prev; + } + } + +#define ACTIVATE_ALL_LOW_HIGH_SEARCH_THRESHOLD_LEN 500 + + if (set->all_low_high != 0 + && range - start > ACTIVATE_ALL_LOW_HIGH_SEARCH_THRESHOLD_LEN) { + do { + int try_count = 0; + for (i = 0; i < n; i++) { + if (sr[i].state == SRS_DEAD) continue; + + if (s < sr[i].low) continue; + if (s >= sr[i].high) { + if (forward_search(set->rs[i].reg, str, end, s, sr[i].sch_range, + &low, &high, &low_prev) != 0) { + sr[i].low = low; + sr[i].high = high; + sr[i].low_prev = 
low_prev; + if (s < low) continue; + } + else { + sr[i].state = SRS_DEAD; + continue; + } + } + + reg = set->rs[i].reg; + REGSET_MATCH_AND_RETURN_CHECK(orig_range); + try_count++; + } /* for (i) */ + + if (s >= range) break; + + if (try_count == 0) { + low = (UChar* )range; + for (i = 0; i < n; i++) { + if (sr[i].state == SRS_LOW_HIGH && low > sr[i].low) { + low = sr[i].low; + low_prev = sr[i].low_prev; + } + } + if (low == range) break; + + s = low; + prev = low_prev; + } + else { + prev = s; + s += enclen(enc, s); + } + } while (1); + } + else { + int prev_is_newline = 1; + do { + for (i = 0; i < n; i++) { + if (sr[i].state == SRS_DEAD) continue; + if (sr[i].state == SRS_LOW_HIGH) { + if (s < sr[i].low) continue; + if (s >= sr[i].high) { + if (forward_search(set->rs[i].reg, str, end, s, sr[i].sch_range, + &low, &high, &low_prev) != 0) { + sr[i].low = low; + sr[i].high = high; + /* sr[i].low_prev = low_prev; */ + if (s < low) continue; + } + else { + sr[i].state = SRS_DEAD; + continue; + } + } + } + + reg = set->rs[i].reg; + if ((reg->anchor & ANCR_ANYCHAR_INF) == 0 || prev_is_newline != 0) { + REGSET_MATCH_AND_RETURN_CHECK(orig_range); + } + } + + if (s >= range) break; + + if (set->anychar_inf != 0) + prev_is_newline = ONIGENC_IS_MBC_NEWLINE(set->enc, s, end); + + prev = s; + s += enclen(enc, s); + } while (1); + } + + xfree(sr); + return ONIG_MISMATCH; + + finish: + xfree(sr); + return r; + + match: + xfree(sr); + *rmatch_pos = (int )(s - str); + return i; +} + +static inline int +regset_search_body_regex_lead(OnigRegSet* set, + const UChar* str, const UChar* end, + const UChar* start, const UChar* orig_range, OnigRegSetLead lead, + OnigOptionType option, OnigMatchParam* mps[], int* rmatch_pos) +{ + int r; + int i; + int n; + int match_index; + const UChar* ep; + regex_t* reg; + OnigRegion* region; + + n = set->n; + + match_index = ONIG_MISMATCH; + ep = orig_range; + for (i = 0; i < n; i++) { + reg = set->rs[i].reg; + region = set->rs[i].region; + r = 
search_in_range(reg, str, end, start, ep, orig_range, region, option, mps[i]); + if (r > 0) { + if (str + r < ep) { + match_index = i; + *rmatch_pos = r; + if (lead == ONIG_REGSET_PRIORITY_TO_REGEX_ORDER) + break; + + ep = str + r; + } + } + else if (r == 0) { + match_index = i; + *rmatch_pos = r; + break; + } + } + + return match_index; +} + +extern int +onig_regset_search_with_param(OnigRegSet* set, + const UChar* str, const UChar* end, + const UChar* start, const UChar* range, + OnigRegSetLead lead, OnigOptionType option, OnigMatchParam* mps[], + int* rmatch_pos) +{ + int r; + int i; + UChar *s, *prev; + regex_t* reg; + OnigEncoding enc; + OnigRegion* region; + MatchArg* msas; + const UChar *orig_start = start; + const UChar *orig_range = range; + + if (set->n == 0) + return ONIG_MISMATCH; + + if (IS_POSIX_REGION(option)) + return ONIGERR_INVALID_ARGUMENT; + + r = 0; + enc = set->enc; + msas = (MatchArg* )NULL; + + for (i = 0; i < set->n; i++) { + reg = set->rs[i].reg; + region = set->rs[i].region; + ADJUST_MATCH_PARAM(reg, mps[i]); + if (IS_NOT_NULL(region)) { + r = onig_region_resize_clear(region, reg->num_mem + 1); + if (r != 0) goto finish_no_msa; + } + } + + if (start > end || start < str) goto mismatch_no_msa; + if (str < end) { + /* forward search only */ + if (range <= start) + return ONIGERR_INVALID_ARGUMENT; + } + + if (ONIG_IS_OPTION_ON(option, ONIG_OPTION_CHECK_VALIDITY_OF_STRING)) { + if (! 
ONIGENC_IS_VALID_MBC_STRING(enc, str, end)) { + r = ONIGERR_INVALID_WIDE_CHAR_VALUE; + goto finish_no_msa; + } + } + + if (set->anchor != OPTIMIZE_NONE && str < end) { + UChar *min_semi_end, *max_semi_end; + + if ((set->anchor & ANCR_BEGIN_POSITION) != 0) { + /* search start-position only */ + begin_position: + range = start + 1; + } + else if ((set->anchor & ANCR_BEGIN_BUF) != 0) { + /* search str-position only */ + if (start != str) goto mismatch_no_msa; + range = str + 1; + } + else if ((set->anchor & ANCR_END_BUF) != 0) { + min_semi_end = max_semi_end = (UChar* )end; + + end_buf: + if ((OnigLen )(max_semi_end - str) < set->anc_dmin) + goto mismatch_no_msa; + + if ((OnigLen )(min_semi_end - start) > set->anc_dmax) { + start = min_semi_end - set->anc_dmax; + if (start < end) + start = onigenc_get_right_adjust_char_head(enc, str, start); + } + if ((OnigLen )(max_semi_end - (range - 1)) < set->anc_dmin) { + range = max_semi_end - set->anc_dmin + 1; + } + if (start > range) goto mismatch_no_msa; + } + else if ((set->anchor & ANCR_SEMI_END_BUF) != 0) { + UChar* pre_end = ONIGENC_STEP_BACK(enc, str, end, 1); + + max_semi_end = (UChar* )end; + if (ONIGENC_IS_MBC_NEWLINE(enc, pre_end, end)) { + min_semi_end = pre_end; + +#ifdef USE_CRNL_AS_LINE_TERMINATOR + pre_end = ONIGENC_STEP_BACK(enc, str, pre_end, 1); + if (IS_NOT_NULL(pre_end) && + ONIGENC_IS_MBC_CRNL(enc, pre_end, end)) { + min_semi_end = pre_end; + } #endif + if (min_semi_end > str && start <= min_semi_end) { + goto end_buf; + } + } + else { + min_semi_end = (UChar* )end; + goto end_buf; + } + } + else if ((set->anchor & ANCR_ANYCHAR_INF_ML) != 0) { + goto begin_position; + } + } + else if (str == end) { /* empty string */ + start = end = str; + s = (UChar* )start; + prev = (UChar* )NULL; + + msas = (MatchArg* )xmalloc(sizeof(*msas) * set->n); + CHECK_NULL_RETURN_MEMERR(msas); + for (i = 0; i < set->n; i++) { + reg = set->rs[i].reg; + MATCH_ARG_INIT(msas[i], reg, option, set->rs[i].region, start, mps[i]); + } + 
for (i = 0; i < set->n; i++) { + reg = set->rs[i].reg; + if (reg->threshold_len == 0) { + REGSET_MATCH_AND_RETURN_CHECK(end); + } + } + + goto mismatch; + } + + if (lead == ONIG_REGSET_POSITION_LEAD) { + msas = (MatchArg* )xmalloc(sizeof(*msas) * set->n); + CHECK_NULL_RETURN_MEMERR(msas); + + for (i = 0; i < set->n; i++) { + MATCH_ARG_INIT(msas[i], set->rs[i].reg, option, set->rs[i].region, + orig_start, mps[i]); + } + + r = regset_search_body_position_lead(set, str, end, start, range, + orig_range, option, msas, rmatch_pos); + } + else { + r = regset_search_body_regex_lead(set, str, end, start, orig_range, + lead, option, mps, rmatch_pos); + } + if (r < 0) goto finish; + else goto match2; + + mismatch: + r = ONIG_MISMATCH; + finish: + for (i = 0; i < set->n; i++) { + if (IS_NOT_NULL(msas)) + MATCH_ARG_FREE(msas[i]); + if (IS_FIND_NOT_EMPTY(set->rs[i].reg->options) && + IS_NOT_NULL(set->rs[i].region)) { + onig_region_clear(set->rs[i].region); + } + } + if (IS_NOT_NULL(msas)) xfree(msas); + return r; + + mismatch_no_msa: + r = ONIG_MISMATCH; + finish_no_msa: + return r; + + match: + *rmatch_pos = (int )(s - str); + match2: + for (i = 0; i < set->n; i++) { + if (IS_NOT_NULL(msas)) + MATCH_ARG_FREE(msas[i]); + if (IS_FIND_NOT_EMPTY(set->rs[i].reg->options) && + IS_NOT_NULL(set->rs[i].region)) { + onig_region_clear(set->rs[i].region); + } + } + if (IS_NOT_NULL(msas)) xfree(msas); + return r; /* regex index */ } +extern int +onig_regset_search(OnigRegSet* set, const UChar* str, const UChar* end, + const UChar* start, const UChar* range, + OnigRegSetLead lead, OnigOptionType option, int* rmatch_pos) +{ + int r; + int i; + OnigMatchParam* mp; + OnigMatchParam** mps; + + mps = (OnigMatchParam** )xmalloc((sizeof(OnigMatchParam*) + sizeof(OnigMatchParam)) * set->n); + CHECK_NULL_RETURN_MEMERR(mps); + + mp = (OnigMatchParam* )(mps + set->n); + + for (i = 0; i < set->n; i++) { + onig_initialize_match_param(mp + i); + mps[i] = mp + i; + } + + r = 
onig_regset_search_with_param(set, str, end, start, range, lead, option, mps, + rmatch_pos); + for (i = 0; i < set->n; i++) + onig_free_match_param_content(mp + i); + + xfree(mps); + + return r; +} static UChar* slow_search(OnigEncoding enc, UChar* target, UChar* target_end, @@ -4193,9 +4665,11 @@ str_lower_case_match(OnigEncoding enc, int case_fold_flag, UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; while (t < tend) { + if (p >= end) return 0; lowlen = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, lowbuf); q = lowbuf; while (lowlen > 0) { + if (t >= tend) return 0; if (*t++ != *q++) return 0; lowlen--; } @@ -4209,16 +4683,11 @@ slow_search_ic(OnigEncoding enc, int case_fold_flag, UChar* target, UChar* target_end, const UChar* text, const UChar* text_end, UChar* text_range) { - UChar *s, *end; - - end = (UChar* )text_end; - end -= target_end - target - 1; - if (end > text_range) - end = text_range; + UChar *s; s = (UChar* )text; - while (s < end) { + while (s < text_range) { if (str_lower_case_match(enc, case_fold_flag, target, target_end, s, text_end)) return s; @@ -4372,60 +4841,6 @@ sunday_quick_search(regex_t* reg, const UChar* target, const UChar* target_end, } static UChar* -sunday_quick_search_case_fold(regex_t* reg, - const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, - const UChar* text_range) -{ - const UChar *s, *se, *end; - const UChar *tail; - int skip, tlen1; - int map_offset; - int case_fold_flag; - OnigEncoding enc; - -#ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, - "sunday_quick_search_case_fold: text: %p, text_end: %p, text_range: %p\n", text, text_end, text_range); -#endif - - enc = reg->enc; - case_fold_flag = reg->case_fold_flag; - - tail = target_end - 1; - tlen1 = (int )(tail - target); - end = text_range; - if (end + tlen1 > text_end) - end = text_end - tlen1; - - map_offset = reg->map_offset; - s = text; - - while (s < end) { - if (str_lower_case_match(enc, case_fold_flag, target, target_end, - s, 
text_end)) - return (UChar* )s; - - se = s + tlen1; - if (se + map_offset >= text_end) break; - skip = reg->map[*(se + map_offset)]; -#if 0 - p = s; - do { - s += enclen(enc, s); - } while ((s - p) < skip && s < end); -#else - /* This is faster than prev code for long text. ex: /(?i)Twain/ */ - s += skip; - if (s < end) - s = onigenc_get_right_adjust_char_head(enc, text, s); -#endif - } - - return (UChar* )NULL; -} - -static UChar* map_search(OnigEncoding enc, UChar map[], const UChar* text, const UChar* text_range) { @@ -4505,25 +4920,26 @@ onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, } static int -forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, - UChar* range, UChar** low, UChar** high, UChar** low_prev) +forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, + UChar* range, UChar** low, UChar** high, UChar** low_prev) { UChar *p, *pprev = (UChar* )NULL; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "forward_search_range: str: %p, end: %p, s: %p, range: %p\n", - str, end, s, range); + fprintf(stderr, "forward_search: str: %p, end: %p, start: %p, range: %p\n", + str, end, start, range); #endif - p = s; - if (reg->dmin > 0) { + p = start; + if (reg->dist_min != 0) { + if (end - p <= reg->dist_min) + return 0; /* fail */ + if (ONIGENC_IS_SINGLEBYTE(reg->enc)) { - p += reg->dmin; + p += reg->dist_min; } else { - UChar *q = p + reg->dmin; - - if (q >= end) return 0; /* fail */ + UChar *q = p + reg->dist_min; while (p < q) p += enclen(reg->enc, p); } } @@ -4538,11 +4954,6 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, reg->exact, reg->exact_end, p, end, range); break; - case OPTIMIZE_STR_CASE_FOLD_FAST: - p = sunday_quick_search_case_fold(reg, reg->exact, reg->exact_end, p, end, - range); - break; - case OPTIMIZE_STR_FAST: p = sunday_quick_search(reg, reg->exact, reg->exact_end, p, end, range); break; @@ -4558,7 +4969,7 @@ forward_search_range(regex_t* 
reg, const UChar* str, const UChar* end, UChar* s, } if (p && p < range) { - if (p - reg->dmin < s) { + if (p - start < reg->dist_min) { retry_gate: pprev = p; p += enclen(reg->enc, p); @@ -4571,8 +4982,7 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, switch (reg->sub_anchor) { case ANCR_BEGIN_LINE: if (!ON_STR_BEGIN(p)) { - prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), p); + prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } @@ -4593,35 +5003,34 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, #endif ) goto retry_gate; + break; } } - if (reg->dmax == 0) { + if (reg->dist_max == 0) { *low = p; if (low_prev) { - if (*low > s) - *low_prev = onigenc_get_prev_char_head(reg->enc, s, p); + if (*low > start) + *low_prev = onigenc_get_prev_char_head(reg->enc, start, p); else *low_prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); } + *high = p; } else { - if (reg->dmax != INFINITE_LEN) { - if (p - str < reg->dmax) { + if (reg->dist_max != INFINITE_LEN) { + if (p - str < reg->dist_max) { *low = (UChar* )str; if (low_prev) *low_prev = onigenc_get_prev_char_head(reg->enc, str, *low); } else { - *low = p - reg->dmax; - if (*low > s) { - *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, s, + *low = p - reg->dist_max; + if (*low > start) { + *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, start, *low, (const UChar** )low_prev); - if (low_prev && IS_NULL(*low_prev)) - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? 
pprev : s), *low); } else { if (low_prev) @@ -4630,14 +5039,18 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, } } } + /* no needs to adjust *high, *high is used as range check only */ + if (p - str < reg->dist_min) + *high = (UChar* )str; + else + *high = p - reg->dist_min; } - /* no needs to adjust *high, *high is used as range check only */ - *high = p - reg->dmin; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, - "forward_search_range success: low: %d, high: %d, dmin: %d, dmax: %d\n", - (int )(*low - str), (int )(*high - str), reg->dmin, reg->dmax); + "forward_search success: low: %d, high: %d, dmin: %u, dmax: %u\n", + (int )(*low - str), (int )(*high - str), + reg->dist_min, reg->dist_max); #endif return 1; /* success */ } @@ -4647,15 +5060,11 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, static int -backward_search_range(regex_t* reg, const UChar* str, const UChar* end, - UChar* s, const UChar* range, UChar* adjrange, - UChar** low, UChar** high) +backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s, + const UChar* range, UChar* adjrange, UChar** low, UChar** high) { UChar *p; - if (range == 0) goto fail; - - range += reg->dmin; p = s; retry: @@ -4667,7 +5076,6 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, break; case OPTIMIZE_STR_CASE_FOLD: - case OPTIMIZE_STR_CASE_FOLD_FAST: p = slow_search_backward_ic(reg->enc, reg->case_fold_flag, reg->exact, reg->exact_end, range, adjrange, end, p); @@ -4722,15 +5130,27 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, } } - /* no needs to adjust *high, *high is used as range check only */ - if (reg->dmax != INFINITE_LEN) { - *low = p - reg->dmax; - *high = p - reg->dmin; + if (reg->dist_max != INFINITE_LEN) { + if (p - str < reg->dist_max) + *low = (UChar* )str; + else + *low = p - reg->dist_max; + + if (reg->dist_min != 0) { + if (p - str < reg->dist_min) + *high = (UChar* )str; 
+ else + *high = p - reg->dist_min; + } + else { + *high = p; + } + *high = onigenc_get_right_adjust_char_head(reg->enc, adjrange, *high); } #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "backward_search_range: low: %d, high: %d\n", + fprintf(stderr, "backward_search: low: %d, high: %d\n", (int )(*low - str), (int )(*high - str)); #endif return 1; /* success */ @@ -4738,7 +5158,7 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, fail: #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "backward_search_range: fail.\n"); + fprintf(stderr, "backward_search: fail.\n"); #endif return 0; /* fail */ } @@ -4751,24 +5171,35 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, { int r; OnigMatchParam mp; + const UChar* data_range; onig_initialize_match_param(&mp); - r = onig_search_with_param(reg, str, end, start, range, region, option, &mp); + + /* The following is an expanded code of onig_search_with_param() */ + if (range > start) + data_range = range; + else + data_range = end; + + r = search_in_range(reg, str, end, start, range, data_range, region, + option, &mp); + onig_free_match_param_content(&mp); return r; } -extern int -onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, - const UChar* start, const UChar* range, OnigRegion* region, - OnigOptionType option, OnigMatchParam* mp) +static int +search_in_range(regex_t* reg, const UChar* str, const UChar* end, + const UChar* start, const UChar* range, /* match start range */ + const UChar* data_range, /* subject string range */ + OnigRegion* region, + OnigOptionType option, OnigMatchParam* mp) { int r; UChar *s, *prev; MatchArg msa; const UChar *orig_start = start; - const UChar *orig_range = range; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, @@ -4851,17 +5282,21 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, min_semi_end = max_semi_end = (UChar* )end; end_buf: - if ((OnigLen )(max_semi_end - str) < reg->anchor_dmin) + if ((OnigLen )(max_semi_end - 
str) < reg->anc_dist_min) goto mismatch_no_msa; if (range > start) { - if ((OnigLen )(min_semi_end - start) > reg->anchor_dmax) { - start = min_semi_end - reg->anchor_dmax; + if (reg->anc_dist_max != INFINITE_LEN && + min_semi_end - start > reg->anc_dist_max) { + start = min_semi_end - reg->anc_dist_max; if (start < end) start = onigenc_get_right_adjust_char_head(reg->enc, str, start); } - if ((OnigLen )(max_semi_end - (range - 1)) < reg->anchor_dmin) { - range = max_semi_end - reg->anchor_dmin + 1; + if (max_semi_end - (range - 1) < reg->anc_dist_min) { + if (max_semi_end - str + 1 < reg->anc_dist_min) + goto mismatch_no_msa; + else + range = max_semi_end - reg->anc_dist_min + 1; } if (start > range) goto mismatch_no_msa; @@ -4869,12 +5304,17 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, Backward search is used. */ } else { - if ((OnigLen )(min_semi_end - range) > reg->anchor_dmax) { - range = min_semi_end - reg->anchor_dmax; + if (reg->anc_dist_max != INFINITE_LEN && + min_semi_end - range > reg->anc_dist_max) { + range = min_semi_end - reg->anc_dist_max; } - if ((OnigLen )(max_semi_end - start) < reg->anchor_dmin) { - start = max_semi_end - reg->anchor_dmin; - start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start); + if (max_semi_end - start < reg->anc_dist_min) { + if (max_semi_end - str < reg->anc_dist_min) + goto mismatch_no_msa; + else { + start = max_semi_end - reg->anc_dist_min; + start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start); + } } if (range > start) goto mismatch_no_msa; } @@ -4942,29 +5382,33 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, if (reg->optimize != OPTIMIZE_NONE) { UChar *sch_range, *low, *high, *low_prev; - sch_range = (UChar* )range; - if (reg->dmax != 0) { - if (reg->dmax == INFINITE_LEN) + if (reg->dist_max != 0) { + if (reg->dist_max == INFINITE_LEN) sch_range = (UChar* )end; else { - sch_range += reg->dmax; - if (sch_range > end) sch_range = (UChar* )end; + if 
((end - range) < reg->dist_max) + sch_range = (UChar* )end; + else { + sch_range = (UChar* )range + reg->dist_max; + } } } + else + sch_range = (UChar* )range; if ((end - start) < reg->threshold_len) goto mismatch; - if (reg->dmax != INFINITE_LEN) { + if (reg->dist_max != INFINITE_LEN) { do { - if (! forward_search_range(reg, str, end, s, sch_range, - &low, &high, &low_prev)) goto mismatch; + if (! forward_search(reg, str, end, s, sch_range, &low, &high, + &low_prev)) goto mismatch; if (s < low) { s = low; prev = low_prev; } while (s <= high) { - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); prev = s; s += enclen(reg->enc, s); } @@ -4972,12 +5416,12 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, goto mismatch; } else { /* check only. */ - if (! forward_search_range(reg, str, end, s, sch_range, - &low, &high, (UChar** )NULL)) goto mismatch; + if (! forward_search(reg, str, end, s, sch_range, &low, &high, + (UChar** )NULL)) goto mismatch; if ((reg->anchor & ANCR_ANYCHAR_INF) != 0) { do { - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); prev = s; s += enclen(reg->enc, s); @@ -4994,13 +5438,13 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, } do { - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); prev = s; s += enclen(reg->enc, s); } while (s < range); if (s == range) { /* because empty match with /$/. 
*/ - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); } } else { /* backward search */ @@ -5011,19 +5455,30 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, if (reg->optimize != OPTIMIZE_NONE) { UChar *low, *high, *adjrange, *sch_start; + const UChar *min_range; + + if ((end - range) < reg->threshold_len) goto mismatch; if (range < end) adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); else adjrange = (UChar* )end; - if (reg->dmax != INFINITE_LEN && - (end - range) >= reg->threshold_len) { + if (end - range > reg->dist_min) + min_range = range + reg->dist_min; + else + min_range = end; + + if (reg->dist_max != INFINITE_LEN) { do { - sch_start = s + reg->dmax; - if (sch_start > end) sch_start = (UChar* )end; - if (backward_search_range(reg, str, end, sch_start, range, adjrange, - &low, &high) <= 0) + if (end - s > reg->dist_max) + sch_start = s + reg->dist_max; + else { + sch_start = onigenc_get_prev_char_head(reg->enc, str, end); + } + + if (backward_search(reg, str, end, sch_start, min_range, adjrange, + &low, &high) <= 0) goto mismatch; if (s > high) @@ -5038,22 +5493,10 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, goto mismatch; } else { /* check only. 
*/ - if ((end - range) < reg->threshold_len) goto mismatch; + sch_start = onigenc_get_prev_char_head(reg->enc, str, end); - sch_start = s; - if (reg->dmax != 0) { - if (reg->dmax == INFINITE_LEN) - sch_start = (UChar* )end; - else { - sch_start += reg->dmax; - if (sch_start > end) sch_start = (UChar* )end; - else - sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, - start, sch_start); - } - } - if (backward_search_range(reg, str, end, sch_start, range, adjrange, - &low, &high) <= 0) goto mismatch; + if (backward_search(reg, str, end, sch_start, min_range, adjrange, + &low, &high) <= 0) goto mismatch; } } @@ -5109,6 +5552,22 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, } extern int +onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, + const UChar* start, const UChar* range, OnigRegion* region, + OnigOptionType option, OnigMatchParam* mp) +{ + const UChar* data_range; + + if (range > start) + data_range = range; + else + data_range = end; + + return search_in_range(reg, str, end, start, range, data_range, region, + option, mp); +} + +extern int onig_scan(regex_t* reg, const UChar* str, const UChar* end, OnigRegion* region, OnigOptionType option, int (*scan_callback)(int, int, OnigRegion*, void*), @@ -5210,6 +5669,202 @@ onig_copy_encoding(OnigEncoding to, OnigEncoding from) *to = *from; } +extern int +onig_regset_new(OnigRegSet** rset, int n, regex_t* regs[]) +{ +#define REGSET_INITIAL_ALLOC_SIZE 10 + + int i; + int r; + int alloc; + OnigRegSet* set; + RR* rs; + + *rset = 0; + + set = (OnigRegSet* )xmalloc(sizeof(*set)); + CHECK_NULL_RETURN_MEMERR(set); + + alloc = n > REGSET_INITIAL_ALLOC_SIZE ? 
n : REGSET_INITIAL_ALLOC_SIZE; + rs = (RR* )xmalloc(sizeof(set->rs[0]) * alloc); + if (IS_NULL(rs)) { + xfree(set); + return ONIGERR_MEMORY; + } + + set->rs = rs; + set->n = 0; + set->alloc = alloc; + + for (i = 0; i < n; i++) { + regex_t* reg = regs[i]; + + r = onig_regset_add(set, reg); + if (r != 0) { + for (i = 0; i < set->n; i++) { + OnigRegion* region = set->rs[i].region; + if (IS_NOT_NULL(region)) + onig_region_free(region, 1); + } + xfree(set->rs); + xfree(set); + return r; + } + } + + *rset = set; + return 0; +} + +static void +update_regset_by_reg(OnigRegSet* set, regex_t* reg) +{ + if (set->n == 1) { + set->enc = reg->enc; + set->anchor = reg->anchor; + set->anc_dmin = reg->anc_dist_min; + set->anc_dmax = reg->anc_dist_max; + set->all_low_high = + (reg->optimize == OPTIMIZE_NONE || reg->dist_max == INFINITE_LEN) ? 0 : 1; + set->anychar_inf = (reg->anchor & ANCR_ANYCHAR_INF) != 0 ? 1 : 0; + } + else { + int anchor; + + anchor = set->anchor & reg->anchor; + if (anchor != 0) { + OnigLen anc_dmin; + OnigLen anc_dmax; + + anc_dmin = set->anc_dmin; + anc_dmax = set->anc_dmax; + if (anc_dmin > reg->anc_dist_min) anc_dmin = reg->anc_dist_min; + if (anc_dmax < reg->anc_dist_max) anc_dmax = reg->anc_dist_max; + set->anc_dmin = anc_dmin; + set->anc_dmax = anc_dmax; + } + + set->anchor = anchor; + + if (reg->optimize == OPTIMIZE_NONE || reg->dist_max == INFINITE_LEN) + set->all_low_high = 0; + + if ((reg->anchor & ANCR_ANYCHAR_INF) != 0) + set->anychar_inf = 1; + } +} + +extern int +onig_regset_add(OnigRegSet* set, regex_t* reg) +{ + OnigRegion* region; + + if (IS_FIND_LONGEST(reg->options)) + return ONIGERR_INVALID_ARGUMENT; + + if (set->n != 0 && reg->enc != set->enc) + return ONIGERR_INVALID_ARGUMENT; + + if (set->n >= set->alloc) { + RR* nrs; + int new_alloc; + + new_alloc = set->alloc * 2; + nrs = (RR* )xrealloc(set->rs, sizeof(set->rs[0]) * new_alloc); + CHECK_NULL_RETURN_MEMERR(nrs); + + set->rs = nrs; + set->alloc = new_alloc; + } + + region = 
onig_region_new(); + CHECK_NULL_RETURN_MEMERR(region); + + set->rs[set->n].reg = reg; + set->rs[set->n].region = region; + set->n++; + + update_regset_by_reg(set, reg); + return 0; +} + +extern int +onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) +{ + int i; + + if (at < 0 || at >= set->n) + return ONIGERR_INVALID_ARGUMENT; + + if (IS_NULL(reg)) { + onig_region_free(set->rs[at].region, 1); + for (i = at; i < set->n - 1; i++) { + set->rs[i].reg = set->rs[i+1].reg; + set->rs[i].region = set->rs[i+1].region; + } + set->n--; + } + else { + if (IS_FIND_LONGEST(reg->options)) + return ONIGERR_INVALID_ARGUMENT; + + if (set->n > 1 && reg->enc != set->enc) + return ONIGERR_INVALID_ARGUMENT; + + set->rs[at].reg = reg; + } + + for (i = 0; i < set->n; i++) + update_regset_by_reg(set, set->rs[i].reg); + + return 0; +} + +extern void +onig_regset_free(OnigRegSet* set) +{ + int i; + + for (i = 0; i < set->n; i++) { + regex_t* reg; + OnigRegion* region; + + reg = set->rs[i].reg; + region = set->rs[i].region; + onig_free(reg); + if (IS_NOT_NULL(region)) + onig_region_free(region, 1); + } + + xfree(set->rs); + xfree(set); +} + +extern int +onig_regset_number_of_regex(OnigRegSet* set) +{ + return set->n; +} + +extern regex_t* +onig_regset_get_regex(OnigRegSet* set, int at) +{ + if (at < 0 || at >= set->n) + return (regex_t* )0; + + return set->rs[at].reg; +} + +extern OnigRegion* +onig_regset_get_region(OnigRegSet* set, int at) +{ + if (at < 0 || at >= set->n) + return (OnigRegion* )0; + + return set->rs[at].region; +} + + #ifdef USE_DIRECT_THREADED_CODE extern int onig_init_for_match_at(regex_t* reg) @@ -5402,35 +6057,25 @@ onig_get_capture_range_in_callout(OnigCalloutArgs* a, int mem_num, int* begin, i const UChar* str; StackType* stk_base; int i; + StackIndex* mem_start_stk; + StackIndex* mem_end_stk; i = mem_num; reg = a->regex; str = a->string; stk_base = a->stk_base; + mem_start_stk = a->mem_start_stk; + mem_end_stk = a->mem_end_stk; if (i > 0) { if 
(a->mem_end_stk[i] != INVALID_STACK_INDEX) { - if (MEM_STATUS_AT(reg->bt_mem_start, i)) - *begin = (int )(STACK_AT(a->mem_start_stk[i])->u.mem.pstr - str); - else - *begin = (int )((UChar* )((void* )a->mem_start_stk[i]) - str); - - *end = (int )((MEM_STATUS_AT(reg->bt_mem_end, i) - ? STACK_AT(a->mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )a->mem_end_stk[i])) - str); + *begin = (int )(STACK_MEM_START(reg, i) - str); + *end = (int )(STACK_MEM_END(reg, i) - str); } else { *begin = *end = ONIG_REGION_NOTPOS; } } - else if (i == 0) { -#if 0 - *begin = a->start - str; - *end = a->current - str; -#else - return ONIGERR_INVALID_ARGUMENT; -#endif - } else return ONIGERR_INVALID_ARGUMENT; @@ -5468,14 +6113,6 @@ onig_builtin_mismatch(OnigCalloutArgs* args ARG_UNUSED, void* user_data ARG_UNUS return ONIG_MISMATCH; } -#if 0 -extern int -onig_builtin_success(OnigCalloutArgs* args ARG_UNUSED, void* user_data ARG_UNUSED) -{ - return ONIG_CALLOUT_SUCCESS; -} -#endif - extern int onig_builtin_error(OnigCalloutArgs* args, void* user_data ARG_UNUSED) { diff --git a/src/regext.c b/src/regext.c index 965c793..c46f630 100644 --- a/src/regext.c +++ b/src/regext.c @@ -2,7 +2,7 @@ regext.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/reggnu.c b/src/reggnu.c index a124ae8..8a45078 100644 --- a/src/reggnu.c +++ b/src/reggnu.c @@ -2,7 +2,7 @@ reggnu.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/regint.h b/src/regint.h index 38389a1..cc540da 100644 --- a/src/regint.h +++ b/src/regint.h @@ -4,7 +4,7 @@ regint.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -47,16 +47,11 @@ #endif #endif -#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ - (defined(__ppc__) && defined(__APPLE__)) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(__mc68020__) -#define PLATFORM_UNALIGNED_WORD_ACCESS -#endif - +#ifndef ONIG_DISABLE_DIRECT_THREADING #ifdef __GNUC__ #define USE_GOTO_LABELS_AS_VALUES #endif +#endif /* config */ /* spec. config */ @@ -82,6 +77,8 @@ #define USE_VARIABLE_META_CHARS #define USE_POSIX_API_REGION_OPTION #define USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE +/* #define USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR */ + #include "regenc.h" @@ -197,49 +194,16 @@ typedef unsigned int uintptr_t; #define CHAR_MAP_SIZE 256 #define INFINITE_LEN ONIG_INFINITE_DISTANCE -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS - -#define PLATFORM_GET_INC(val,p,type) do{\ - val = *(type* )p;\ - (p) += sizeof(type);\ -} while(0) - -#else - -#define PLATFORM_GET_INC(val,p,type) do{\ - xmemcpy(&val, (p), sizeof(type));\ - (p) += sizeof(type);\ -} while(0) - -/* sizeof(OnigCodePoint) */ -#ifdef SIZEOF_SIZE_T -# define WORD_ALIGNMENT_SIZE SIZEOF_SIZE_T -#else -# define WORD_ALIGNMENT_SIZE SIZEOF_LONG -#endif - -#define GET_ALIGNMENT_PAD_SIZE(addr,pad_size) do {\ - (pad_size) = WORD_ALIGNMENT_SIZE - ((uintptr_t )(addr) % WORD_ALIGNMENT_SIZE);\ - if ((pad_size) == WORD_ALIGNMENT_SIZE) (pad_size) = 0;\ -} while (0) - -#define ALIGNMENT_RIGHT(addr) do {\ - (addr) += (WORD_ALIGNMENT_SIZE - 1);\ - (addr) -= ((uintptr_t )(addr) % 
WORD_ALIGNMENT_SIZE);\ -} while (0) - -#endif /* PLATFORM_UNALIGNED_WORD_ACCESS */ - #ifdef USE_CALLOUT typedef struct { - int flag; - OnigCalloutOf of; - int in; - int name_id; - const UChar* tag_start; - const UChar* tag_end; + int flag; + OnigCalloutOf of; + int in; + int name_id; + const UChar* tag_start; + const UChar* tag_end; OnigCalloutType type; OnigCalloutFunc start_func; OnigCalloutFunc end_func; @@ -272,7 +236,6 @@ enum OptimizeType { OPTIMIZE_STR, /* Slow Search */ OPTIMIZE_STR_FAST, /* Sunday quick search / BMH */ OPTIMIZE_STR_FAST_STEP_FORWARD, /* Sunday quick search / BMH */ - OPTIMIZE_STR_CASE_FOLD_FAST, /* Sunday quick search / BMH (ignore case) */ OPTIMIZE_STR_CASE_FOLD, /* Slow Search (ignore case) */ OPTIMIZE_MAP /* char map */ }; @@ -288,6 +251,8 @@ typedef unsigned int MemStatusType; #define MEM_STATUS_AT0(stats,n) \ ((n) > 0 && (n) < (int )MEM_STATUS_BITS_NUM ? ((stats) & ((MemStatusType )1 << n)) : ((stats) & 1)) +#define MEM_STATUS_IS_ALL_ON(stats) (((stats) & 1) != 0) + #define MEM_STATUS_ON(stats,n) do {\ if ((n) < (int )MEM_STATUS_BITS_NUM) {\ if ((n) != 0)\ @@ -302,8 +267,14 @@ typedef unsigned int MemStatusType; (stats) |= ((MemStatusType )1 << (n));\ } while (0) +#define MEM_STATUS_LIMIT_AT(stats,n) \ + ((n) < (int )MEM_STATUS_BITS_NUM ? 
((stats) & ((MemStatusType )1 << n)) : 0) +#define MEM_STATUS_LIMIT_ON(stats,n) do {\ + if ((n) < (int )MEM_STATUS_BITS_NUM && (n) != 0) {\ + (stats) |= ((MemStatusType )1 << (n));\ + }\ +} while (0) -#define INT_MAX_LIMIT ((1UL << (SIZEOF_INT * 8 - 1)) - 1) #define IS_CODE_WORD_ASCII(enc,code) \ (ONIGENC_IS_CODE_ASCII(code) && ONIGENC_IS_CODE_WORD(enc,code)) @@ -354,16 +325,12 @@ typedef unsigned int MemStatusType; /* bitset */ #define BITS_PER_BYTE 8 #define SINGLE_BYTE_SIZE (1 << BITS_PER_BYTE) -#define BITS_IN_ROOM (sizeof(Bits) * BITS_PER_BYTE) +#define BITS_IN_ROOM 32 /* 4 * BITS_PER_BYTE */ #define BITSET_SIZE (SINGLE_BYTE_SIZE / BITS_IN_ROOM) -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS -typedef unsigned int Bits; -#else -typedef unsigned char Bits; -#endif -typedef Bits BitSet[BITSET_SIZE]; -typedef Bits* BitSetRef; +typedef uint32_t Bits; +typedef Bits BitSet[BITSET_SIZE]; +typedef Bits* BitSetRef; #define SIZE_BITSET sizeof(BitSet) @@ -372,8 +339,8 @@ typedef Bits* BitSetRef; for (i = 0; i < (int )BITSET_SIZE; i++) { (bs)[i] = 0; } \ } while (0) -#define BS_ROOM(bs,pos) (bs)[pos / BITS_IN_ROOM] -#define BS_BIT(pos) (1 << (pos % BITS_IN_ROOM)) +#define BS_ROOM(bs,pos) (bs)[(unsigned int )(pos) >> 5] +#define BS_BIT(pos) (1u << ((unsigned int )(pos) & 0x1f)) #define BITSET_AT(bs, pos) (BS_ROOM(bs,pos) & BS_BIT(pos)) #define BITSET_SET_BIT(bs, pos) BS_ROOM(bs,pos) |= BS_BIT(pos) @@ -389,11 +356,13 @@ typedef struct _BBuf { #define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size)) +/* #define BB_SIZE_INC(buf,inc) do{\ (buf)->alloc += (inc);\ (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ } while (0) +*/ #define BB_EXPAND(buf,low) do{\ do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ @@ -491,39 +460,34 @@ typedef struct _BBuf { /* operation code */ enum OpCode { - OP_FINISH = 0, /* matching process terminator (no more alternative) */ - OP_END = 1, /* pattern code terminator (success end) 
*/ - - OP_EXACT1 = 2, /* single byte, N = 1 */ - OP_EXACT2, /* single byte, N = 2 */ - OP_EXACT3, /* single byte, N = 3 */ - OP_EXACT4, /* single byte, N = 4 */ - OP_EXACT5, /* single byte, N = 5 */ - OP_EXACTN, /* single byte */ - OP_EXACTMB2N1, /* mb-length = 2 N = 1 */ - OP_EXACTMB2N2, /* mb-length = 2 N = 2 */ - OP_EXACTMB2N3, /* mb-length = 2 N = 3 */ - OP_EXACTMB2N, /* mb-length = 2 */ - OP_EXACTMB3N, /* mb-length = 3 */ - OP_EXACTMBN, /* other length */ - - OP_EXACT1_IC, /* single byte, N = 1, ignore case */ - OP_EXACTN_IC, /* single byte, ignore case */ - + OP_FINISH = 0, /* matching process terminator (no more alternative) */ + OP_END = 1, /* pattern code terminator (success end) */ + OP_STR_1 = 2, /* single byte, N = 1 */ + OP_STR_2, /* single byte, N = 2 */ + OP_STR_3, /* single byte, N = 3 */ + OP_STR_4, /* single byte, N = 4 */ + OP_STR_5, /* single byte, N = 5 */ + OP_STR_N, /* single byte */ + OP_STR_MB2N1, /* mb-length = 2 N = 1 */ + OP_STR_MB2N2, /* mb-length = 2 N = 2 */ + OP_STR_MB2N3, /* mb-length = 2 N = 3 */ + OP_STR_MB2N, /* mb-length = 2 */ + OP_STR_MB3N, /* mb-length = 3 */ + OP_STR_MBN, /* other length */ + OP_STR_1_IC, /* single byte, N = 1, ignore case */ + OP_STR_N_IC, /* single byte, ignore case */ OP_CCLASS, OP_CCLASS_MB, OP_CCLASS_MIX, OP_CCLASS_NOT, OP_CCLASS_MB_NOT, OP_CCLASS_MIX_NOT, - OP_ANYCHAR, /* "." */ OP_ANYCHAR_ML, /* "." 
multi-line */ OP_ANYCHAR_STAR, /* ".*" */ OP_ANYCHAR_ML_STAR, /* ".*" multi-line */ OP_ANYCHAR_STAR_PEEK_NEXT, OP_ANYCHAR_ML_STAR_PEEK_NEXT, - OP_WORD, OP_WORD_ASCII, OP_NO_WORD, @@ -532,16 +496,13 @@ enum OpCode { OP_NO_WORD_BOUNDARY, OP_WORD_BEGIN, OP_WORD_END, - OP_TEXT_SEGMENT_BOUNDARY, - OP_BEGIN_BUF, OP_END_BUF, OP_BEGIN_LINE, OP_END_LINE, OP_SEMI_END_BUF, OP_BEGIN_POSITION, - OP_BACKREF1, OP_BACKREF2, OP_BACKREF_N, @@ -552,34 +513,35 @@ enum OpCode { OP_BACKREF_WITH_LEVEL_IC, /* \k<xxx+n>, \k<xxx-n> */ OP_BACKREF_CHECK, /* (?(n)), (?('name')) */ OP_BACKREF_CHECK_WITH_LEVEL, /* (?(n-level)), (?('name-level')) */ - - OP_MEMORY_START, - OP_MEMORY_START_PUSH, /* push back-tracker to stack */ - OP_MEMORY_END_PUSH, /* push back-tracker to stack */ - OP_MEMORY_END_PUSH_REC, /* push back-tracker to stack */ - OP_MEMORY_END, - OP_MEMORY_END_REC, /* push marker to stack */ - + OP_MEM_START, + OP_MEM_START_PUSH, /* push back-tracker to stack */ + OP_MEM_END_PUSH, /* push back-tracker to stack */ +#ifdef USE_CALL + OP_MEM_END_PUSH_REC, /* push back-tracker to stack */ +#endif + OP_MEM_END, +#ifdef USE_CALL + OP_MEM_END_REC, /* push marker to stack */ +#endif OP_FAIL, /* pop stack and move */ OP_JUMP, OP_PUSH, OP_PUSH_SUPER, OP_POP_OUT, #ifdef USE_OP_PUSH_OR_JUMP_EXACT - OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ + OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ #endif - OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */ - OP_REPEAT, /* {n,m} */ - OP_REPEAT_NG, /* {n,m}? (non greedy) */ + OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */ + OP_REPEAT, /* {n,m} */ + OP_REPEAT_NG, /* {n,m}? 
(non greedy) */ OP_REPEAT_INC, - OP_REPEAT_INC_NG, /* non greedy */ - OP_REPEAT_INC_SG, /* search and get in stack */ - OP_REPEAT_INC_NG_SG, /* search and get in stack (non greedy) */ + OP_REPEAT_INC_NG, /* non greedy */ OP_EMPTY_CHECK_START, /* null loop checker start */ OP_EMPTY_CHECK_END, /* null loop checker end */ OP_EMPTY_CHECK_END_MEMST, /* null loop checker end (with capture status) */ +#ifdef USE_CALL OP_EMPTY_CHECK_END_MEMST_PUSH, /* with capture status and push check-end */ - +#endif OP_PREC_READ_START, /* (?=...) start */ OP_PREC_READ_END, /* (?=...) end */ OP_PREC_READ_NOT_START, /* (?!...) start */ @@ -589,11 +551,12 @@ enum OpCode { OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */ OP_LOOK_BEHIND_NOT_START, /* (?<!...) start */ OP_LOOK_BEHIND_NOT_END, /* (?<!...) end */ - - OP_CALL, /* \g<name> */ - OP_RETURN, OP_PUSH_SAVE_VAL, OP_UPDATE_VAR, +#ifdef USE_CALL + OP_CALL, /* \g<name> */ + OP_RETURN, +#endif #ifdef USE_CALLOUT OP_CALLOUT_CONTENTS, /* (?{...}) (?{{...}}) */ OP_CALLOUT_NAME, /* (*name) (*name[tag](args...)) */ @@ -601,8 +564,8 @@ enum OpCode { }; enum SaveType { - SAVE_KEEP = 0, /* SAVE S */ - SAVE_S = 1, + SAVE_KEEP = 0, /* SAVE S */ + SAVE_S = 1, SAVE_RIGHT_RANGE = 2, }; @@ -642,116 +605,57 @@ typedef int ModeType; #define SIZE_UPDATE_VAR_TYPE sizeof(UpdateVarType) #define SIZE_MODE sizeof(ModeType) -#define GET_RELADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, RelAddrType) -#define GET_ABSADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, AbsAddrType) -#define GET_LENGTH_INC(len,p) PLATFORM_GET_INC(len, p, LengthType) -#define GET_MEMNUM_INC(num,p) PLATFORM_GET_INC(num, p, MemNumType) -#define GET_REPEATNUM_INC(num,p) PLATFORM_GET_INC(num, p, RepeatNumType) -#define GET_OPTION_INC(option,p) PLATFORM_GET_INC(option, p, OnigOptionType) -#define GET_POINTER_INC(ptr,p) PLATFORM_GET_INC(ptr, p, PointerType) -#define GET_SAVE_TYPE_INC(type,p) PLATFORM_GET_INC(type, p, SaveType) -#define GET_UPDATE_VAR_TYPE_INC(type,p) PLATFORM_GET_INC(type, p, 
UpdateVarType) -#define GET_MODE_INC(mode,p) PLATFORM_GET_INC(mode, p, ModeType) - /* code point's address must be aligned address. */ #define GET_CODE_POINT(code,p) code = *((OnigCodePoint* )(p)) -#define GET_BYTE_INC(byte,p) do{\ - byte = *(p);\ - (p)++;\ -} while(0) /* op-code + arg size */ -#if 0 -#define SIZE_OP_ANYCHAR_STAR SIZE_OPCODE -#define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT (SIZE_OPCODE + 1) -#define SIZE_OP_JUMP (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_PUSH (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_PUSH_SUPER (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_POP_OUT SIZE_OPCODE -#ifdef USE_OP_PUSH_OR_JUMP_EXACT -#define SIZE_OP_PUSH_OR_JUMP_EXACT1 (SIZE_OPCODE + SIZE_RELADDR + 1) -#endif -#define SIZE_OP_PUSH_IF_PEEK_NEXT (SIZE_OPCODE + SIZE_RELADDR + 1) -#define SIZE_OP_REPEAT_INC (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_REPEAT_INC_NG (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_WORD_BOUNDARY (SIZE_OPCODE + SIZE_MODE) -#define SIZE_OP_PREC_READ_START SIZE_OPCODE -#define SIZE_OP_PREC_READ_NOT_START (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_PREC_READ_END SIZE_OPCODE -#define SIZE_OP_PREC_READ_NOT_END SIZE_OPCODE -#define SIZE_OP_FAIL SIZE_OPCODE -#define SIZE_OP_MEMORY_START (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_START_PUSH (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END_PUSH (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END_PUSH_REC (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END_REC (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_ATOMIC_START SIZE_OPCODE -#define SIZE_OP_ATOMIC_END SIZE_OPCODE -#define SIZE_OP_EMPTY_CHECK_START (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_EMPTY_CHECK_END (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_LOOK_BEHIND (SIZE_OPCODE + SIZE_LENGTH) -#define SIZE_OP_LOOK_BEHIND_NOT_START (SIZE_OPCODE + SIZE_RELADDR + SIZE_LENGTH) -#define SIZE_OP_LOOK_BEHIND_NOT_END SIZE_OPCODE -#define SIZE_OP_CALL (SIZE_OPCODE + SIZE_ABSADDR) 
-#define SIZE_OP_RETURN SIZE_OPCODE -#define SIZE_OP_PUSH_SAVE_VAL (SIZE_OPCODE + SIZE_SAVE_TYPE + SIZE_MEMNUM) -#define SIZE_OP_UPDATE_VAR (SIZE_OPCODE + SIZE_UPDATE_VAR_TYPE + SIZE_MEMNUM) - -#ifdef USE_CALLOUT -#define SIZE_OP_CALLOUT_CONTENTS (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_CALLOUT_NAME (SIZE_OPCODE + SIZE_MEMNUM + SIZE_MEMNUM) -#endif - -#else /* if 0 */ /* for relative address increment to go next op. */ -#define SIZE_INC_OP 1 - -#define SIZE_OP_ANYCHAR_STAR 1 -#define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT 1 -#define SIZE_OP_JUMP 1 -#define SIZE_OP_PUSH 1 -#define SIZE_OP_PUSH_SUPER 1 -#define SIZE_OP_POP_OUT 1 +#define SIZE_INC 1 + +#define OPSIZE_ANYCHAR_STAR 1 +#define OPSIZE_ANYCHAR_STAR_PEEK_NEXT 1 +#define OPSIZE_JUMP 1 +#define OPSIZE_PUSH 1 +#define OPSIZE_PUSH_SUPER 1 +#define OPSIZE_POP_OUT 1 #ifdef USE_OP_PUSH_OR_JUMP_EXACT -#define SIZE_OP_PUSH_OR_JUMP_EXACT1 1 -#endif -#define SIZE_OP_PUSH_IF_PEEK_NEXT 1 -#define SIZE_OP_REPEAT 1 -#define SIZE_OP_REPEAT_INC 1 -#define SIZE_OP_REPEAT_INC_NG 1 -#define SIZE_OP_WORD_BOUNDARY 1 -#define SIZE_OP_PREC_READ_START 1 -#define SIZE_OP_PREC_READ_NOT_START 1 -#define SIZE_OP_PREC_READ_END 1 -#define SIZE_OP_PREC_READ_NOT_END 1 -#define SIZE_OP_BACKREF 1 -#define SIZE_OP_FAIL 1 -#define SIZE_OP_MEMORY_START 1 -#define SIZE_OP_MEMORY_START_PUSH 1 -#define SIZE_OP_MEMORY_END_PUSH 1 -#define SIZE_OP_MEMORY_END_PUSH_REC 1 -#define SIZE_OP_MEMORY_END 1 -#define SIZE_OP_MEMORY_END_REC 1 -#define SIZE_OP_ATOMIC_START 1 -#define SIZE_OP_ATOMIC_END 1 -#define SIZE_OP_EMPTY_CHECK_START 1 -#define SIZE_OP_EMPTY_CHECK_END 1 -#define SIZE_OP_LOOK_BEHIND 1 -#define SIZE_OP_LOOK_BEHIND_NOT_START 1 -#define SIZE_OP_LOOK_BEHIND_NOT_END 1 -#define SIZE_OP_CALL 1 -#define SIZE_OP_RETURN 1 -#define SIZE_OP_PUSH_SAVE_VAL 1 -#define SIZE_OP_UPDATE_VAR 1 +#define OPSIZE_PUSH_OR_JUMP_EXACT1 1 +#endif +#define OPSIZE_PUSH_IF_PEEK_NEXT 1 +#define OPSIZE_REPEAT 1 +#define OPSIZE_REPEAT_INC 1 +#define OPSIZE_REPEAT_INC_NG 1 
+#define OPSIZE_WORD_BOUNDARY 1 +#define OPSIZE_PREC_READ_START 1 +#define OPSIZE_PREC_READ_NOT_START 1 +#define OPSIZE_PREC_READ_END 1 +#define OPSIZE_PREC_READ_NOT_END 1 +#define OPSIZE_BACKREF 1 +#define OPSIZE_FAIL 1 +#define OPSIZE_MEM_START 1 +#define OPSIZE_MEM_START_PUSH 1 +#define OPSIZE_MEM_END_PUSH 1 +#define OPSIZE_MEM_END_PUSH_REC 1 +#define OPSIZE_MEM_END 1 +#define OPSIZE_MEM_END_REC 1 +#define OPSIZE_ATOMIC_START 1 +#define OPSIZE_ATOMIC_END 1 +#define OPSIZE_EMPTY_CHECK_START 1 +#define OPSIZE_EMPTY_CHECK_END 1 +#define OPSIZE_LOOK_BEHIND 1 +#define OPSIZE_LOOK_BEHIND_NOT_START 1 +#define OPSIZE_LOOK_BEHIND_NOT_END 1 +#define OPSIZE_CALL 1 +#define OPSIZE_RETURN 1 +#define OPSIZE_PUSH_SAVE_VAL 1 +#define OPSIZE_UPDATE_VAR 1 #ifdef USE_CALLOUT -#define SIZE_OP_CALLOUT_CONTENTS 1 -#define SIZE_OP_CALLOUT_NAME 1 +#define OPSIZE_CALLOUT_CONTENTS 1 +#define OPSIZE_CALLOUT_NAME 1 #endif -#endif /* if 0 */ #define MC_ESC(syn) (syn)->meta_char_table.esc @@ -882,7 +786,7 @@ typedef struct { } repeat; /* REPEAT, REPEAT_NG */ struct { MemNumType id; - } repeat_inc; /* REPEAT_INC, REPEAT_INC_SG, REPEAT_INC_NG, REPEAT_INC_NG_SG */ + } repeat_inc; /* REPEAT_INC, REPEAT_INC_NG */ struct { MemNumType mem; } empty_check_start; @@ -933,48 +837,58 @@ typedef struct { #endif } RegexExt; +typedef struct { + int lower; + int upper; + union { + Operation* pcode; /* address of repeated body */ + int offset; + } u; +} RepeatRange; + struct re_pattern_buffer { /* common members of BBuf(bytes-buffer) */ Operation* ops; #ifdef USE_DIRECT_THREADED_CODE enum OpCode* ocs; #endif - Operation* ops_curr; - unsigned int ops_used; /* used space for ops */ - unsigned int ops_alloc; /* allocated space for ops */ + Operation* ops_curr; + unsigned int ops_used; /* used space for ops */ + unsigned int ops_alloc; /* allocated space for ops */ unsigned char* string_pool; unsigned char* string_pool_end; - int num_mem; /* used memory(...) 
num counted from 1 */ - int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ - int num_null_check; /* OP_EMPTY_CHECK_START/END id counter */ - int num_call; /* number of subexp call */ - unsigned int capture_history; /* (?@...) flag (1-31) */ - unsigned int bt_mem_start; /* need backtrack flag */ - unsigned int bt_mem_end; /* need backtrack flag */ - int stack_pop_level; - int repeat_range_alloc; - OnigRepeatRange* repeat_range; - - OnigEncoding enc; - OnigOptionType options; - OnigSyntaxType* syntax; - OnigCaseFoldType case_fold_flag; - void* name_table; + int num_mem; /* used memory(...) num counted from 1 */ + int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ + int num_empty_check; /* OP_EMPTY_CHECK_START/END id counter */ + int num_call; /* number of subexp call */ + MemStatusType capture_history; /* (?@...) flag (1-31) */ + MemStatusType push_mem_start; /* need backtrack flag */ + MemStatusType push_mem_end; /* need backtrack flag */ + MemStatusType empty_status_mem; + int stack_pop_level; + int repeat_range_alloc; + RepeatRange* repeat_range; + + OnigEncoding enc; + OnigOptionType options; + OnigSyntaxType* syntax; + OnigCaseFoldType case_fold_flag; + void* name_table; /* optimization info (string search, char-map and anchors) */ int optimize; /* optimize flag */ int threshold_len; /* search str-length for apply optimize */ int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ - OnigLen anchor_dmin; /* (SEMI_)END_BUF anchor distance */ - OnigLen anchor_dmax; /* (SEMI_)END_BUF anchor distance */ + OnigLen anc_dist_min; /* (SEMI_)END_BUF anchor distance */ + OnigLen anc_dist_max; /* (SEMI_)END_BUF anchor distance */ int sub_anchor; /* start-anchor for exact or map */ unsigned char *exact; unsigned char *exact_end; unsigned char map[CHAR_MAP_SIZE]; /* used as BMH skip or char-map */ int map_offset; - OnigLen dmin; /* min-distance of exact or map */ - OnigLen dmax; /* max-distance of exact or map */ + OnigLen dist_min; /* min-distance of exact or map */ 
+ OnigLen dist_max; /* max-distance of exact or map */ RegexExt* extp; }; diff --git a/src/regparse.c b/src/regparse.c index 7f8b1a9..fed53f7 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -2,7 +2,7 @@ regparse.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -199,6 +199,24 @@ onig_set_parse_depth_limit(unsigned int depth) return 0; } +#ifdef ONIG_DEBUG_PARSE +#define INC_PARSE_DEPTH(d) do {\ + (d)++;\ + if (env->max_parse_depth < (d)) env->max_parse_depth = d;\ + if ((d) > ParseDepthLimit) \ + return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ +} while (0) +#else +#define INC_PARSE_DEPTH(d) do {\ + (d)++;\ + if ((d) > ParseDepthLimit) \ + return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ +} while (0) +#endif + +#define DEC_PARSE_DEPTH(d) (d)-- + + static int bbuf_init(BBuf* buf, int size) { @@ -244,7 +262,8 @@ bbuf_clone(BBuf** rto, BBuf* from) return 0; } -static int backref_rel_to_abs(int rel_no, ScanEnv* env) +static int +backref_rel_to_abs(int rel_no, ScanEnv* env) { if (rel_no > 0) { return env->num_mem + rel_no; @@ -292,15 +311,6 @@ bitset_set_range(BitSetRef bs, int from, int to) } } -#if 0 -static void -bitset_set_all(BitSetRef bs) -{ - int i; - for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); } -} -#endif - static void bitset_invert(BitSetRef bs) { @@ -363,24 +373,6 @@ save_entry(ScanEnv* env, enum SaveType type, int* id) { int nid = env->save_num; -#if 0 - if (IS_NULL(env->saves)) { - int n = 10; - env->saves = (SaveItem* )xmalloc(sizeof(SaveItem) * n); - CHECK_NULL_RETURN_MEMERR(env->saves); - env->save_alloc_num = n; - } - else if (env->save_alloc_num <= nid) { - int n = env->save_alloc_num * 2; - SaveItem* p = (SaveItem* )xrealloc(env->saves, sizeof(SaveItem) * n); - 
CHECK_NULL_RETURN_MEMERR(p); - env->saves = p; - env->save_alloc_num = n; - } - - env->saves[nid].type = type; -#endif - env->save_num++; *id = nid; return 0; @@ -476,14 +468,14 @@ static int str_end_hash(st_str_end_key* x) { UChar *p; - int val = 0; + unsigned val = 0; p = x->s; while (p < x->end) { - val = val * 997 + (int )*p++; + val = val * 997 + (unsigned )*p++; } - return val + (val >> 5); + return (int) (val + (val >> 5)); } extern hash_table_type* @@ -566,15 +558,15 @@ static int callout_name_table_hash(st_callout_name_key* x) { UChar *p; - int val = 0; + unsigned int val = 0; p = x->s; while (p < x->end) { - val = val * 997 + (int )*p++; + val = val * 997 + (unsigned int )*p++; } /* use intptr_t for escape warning in Windows */ - return val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type; + return (int )(val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type); } extern hash_table_type* @@ -1972,9 +1964,8 @@ callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, static void scan_env_clear(ScanEnv* env) { - MEM_STATUS_CLEAR(env->capture_history); - MEM_STATUS_CLEAR(env->bt_mem_start); - MEM_STATUS_CLEAR(env->bt_mem_end); + MEM_STATUS_CLEAR(env->cap_history); + MEM_STATUS_CLEAR(env->backtrack_mem); MEM_STATUS_CLEAR(env->backrefed_mem); env->error = (UChar* )NULL; env->error_end = (UChar* )NULL; @@ -1993,6 +1984,10 @@ scan_env_clear(ScanEnv* env) xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static)); env->parse_depth = 0; +#ifdef ONIG_DEBUG_PARSE + env->max_parse_depth = 0; +#endif + env->backref_num = 0; env->keep_num = 0; env->save_num = 0; env->save_alloc_num = 0; @@ -2024,11 +2019,8 @@ scan_env_add_mem_entry(ScanEnv* env) } for (i = env->num_mem + 1; i < alloc; i++) { - p[i].node = NULL_NODE; -#if 0 - p[i].in = 0; - p[i].recursion = 0; -#endif + p[i].mem_node = NULL_NODE; + p[i].empty_repeat_node = NULL_NODE; } env->mem_env_dynamic = p; @@ -2044,7 +2036,7 @@ static int scan_env_set_mem_node(ScanEnv* env, int num, Node* 
node) { if (env->num_mem >= num) - SCANENV_MEMENV(env)[num].node = node; + SCANENV_MEMENV(env)[num].mem_node = node; else return ONIGERR_PARSER_BUG; return 0; @@ -2182,7 +2174,7 @@ node_new_ctype(int type, int not, OnigOptionType options) static Node* node_new_anychar(void) { - Node* node = node_new_ctype(CTYPE_ANYCHAR, 0, ONIG_OPTION_NONE); + Node* node = node_new_ctype(CTYPE_ANYCHAR, FALSE, ONIG_OPTION_NONE); return node; } @@ -2242,24 +2234,6 @@ onig_node_new_list(Node* left, Node* right) } extern Node* -onig_node_list_add(Node* list, Node* x) -{ - Node *n; - - n = onig_node_new_list(x, NULL); - if (IS_NULL(n)) return NULL_NODE; - - if (IS_NOT_NULL(list)) { - while (IS_NOT_NULL(NODE_CDR(list))) - list = NODE_CDR(list); - - NODE_CDR(list) = n; - } - - return n; -} - -extern Node* onig_node_new_alt(Node* left, Node* right) { Node* node = node_new(); @@ -2357,7 +2331,7 @@ node_new_backref(int back_num, int* backrefs, int by_name, for (i = 0; i < back_num; i++) { if (backrefs[i] <= env->num_mem && - IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].node)) { + IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) { NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */ break; } @@ -2377,6 +2351,8 @@ node_new_backref(int back_num, int* backrefs, int by_name, for (i = 0; i < back_num; i++) p[i] = backrefs[i]; } + + env->backref_num++; return node; } @@ -2424,13 +2400,13 @@ node_new_quantifier(int lower, int upper, int by_number) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_QUANT); - QUANT_(node)->lower = lower; - QUANT_(node)->upper = upper; - QUANT_(node)->greedy = 1; - QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; - QUANT_(node)->head_exact = NULL_NODE; - QUANT_(node)->next_head_exact = NULL_NODE; - QUANT_(node)->is_refered = 0; + QUANT_(node)->lower = lower; + QUANT_(node)->upper = upper; + QUANT_(node)->greedy = 1; + QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; + QUANT_(node)->head_exact = NULL_NODE; + QUANT_(node)->next_head_exact = NULL_NODE; + 
QUANT_(node)->include_referred = 0; if (by_number != 0) NODE_STATUS_ADD(node, BY_NUMBER); @@ -2716,7 +2692,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[1] = NULL_NODE; r = ONIGERR_MEMORY; - ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, 0); + ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, FALSE); if (IS_NULL(ns[0])) goto err; r = node_new_true_anychar(&ns[1], env); @@ -2727,7 +2703,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[0] = x; ns[1] = NULL_NODE; - x = node_new_quantifier(0, INFINITE_REPEAT, 1); + x = node_new_quantifier(0, INFINITE_REPEAT, TRUE); if (IS_NULL(x)) goto err; NODE_BODY(x) = ns[0]; @@ -2796,7 +2772,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, ns[0] = x; - x = node_new_quantifier(lower, upper, 0); + x = node_new_quantifier(lower, upper, FALSE); if (IS_NULL(x)) goto err0; NODE_BODY(x) = ns[0]; @@ -2825,7 +2801,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, x = make_alt(2, ns); if (IS_NULL(x)) goto err0; - if (is_range_cutter != 0) + if (is_range_cutter != FALSE) NODE_STATUS_ADD(x, SUPER); *node = x; @@ -2915,7 +2891,10 @@ make_range_clear(Node** node, ScanEnv* env) ns[0] = NULL_NODE; ns[1] = x; - r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, 0, env); +#define ID_NOT_USED_DONT_CARE_ME 0 + + r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, + ID_NOT_USED_DONT_CARE_ME, env); if (r != 0) goto err; x = make_alt(2, ns); @@ -3034,7 +3013,7 @@ make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* qua id1 = GIMMICK_(ns[0])->id; r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive, - 0, env); + FALSE, env); if (r != 0) goto err; ns[2] = ns[3] = NULL_NODE; @@ -3077,7 +3056,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (expr == NULL_NODE) { /* default expr \O* */ - quant = node_new_quantifier(0, INFINITE_REPEAT, 0); + quant = 
node_new_quantifier(0, INFINITE_REPEAT, FALSE); if (IS_NULL(quant)) goto err0; r = node_new_true_anychar(&body, env); @@ -3204,16 +3183,6 @@ node_str_cat_char(Node* node, UChar c) } extern void -onig_node_conv_to_str_node(Node* node, int flag) -{ - NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->flag = flag; - STR_(node)->capacity = 0; - STR_(node)->s = STR_(node)->buf; - STR_(node)->end = STR_(node)->buf; -} - -extern void onig_node_str_clear(Node* node) { if (STR_(node)->capacity != 0 && @@ -3221,10 +3190,11 @@ onig_node_str_clear(Node* node) xfree(STR_(node)->s); } - STR_(node)->capacity = 0; STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; } static Node* @@ -3234,10 +3204,12 @@ node_new_str(const UChar* s, const UChar* end) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->capacity = 0; STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; + if (onig_node_str_cat(node, s, end)) { onig_node_free(node); return NULL; @@ -3252,11 +3224,11 @@ onig_node_new_str(const UChar* s, const UChar* end) } static Node* -node_new_str_raw(UChar* s, UChar* end) +node_new_str_crude(UChar* s, UChar* end) { Node* node = node_new_str(s, end); CHECK_NULL_RETURN(node); - NODE_STRING_SET_RAW(node); + NODE_STRING_SET_CRUDE(node); return node; } @@ -3267,14 +3239,14 @@ node_new_empty(void) } static Node* -node_new_str_raw_char(UChar c) +node_new_str_crude_char(UChar c) { int i; UChar p[1]; Node* node; p[0] = c; - node = node_new_str_raw(p, p + 1); + node = node_new_str_crude(p, p + 1); /* clear buf tail */ for (i = 1; i < NODE_STRING_BUF_SIZE; i++) @@ -3297,8 +3269,8 @@ str_node_split_last_char(Node* node, OnigEncoding enc) if (p && p > sn->s) { /* can be split. 
*/ rn = node_new_str(p, sn->end); CHECK_NULL_RETURN(rn); - if (NODE_STRING_IS_RAW(node)) - NODE_STRING_SET_RAW(rn); + if (NODE_STRING_IS_CRUDE(node)) + NODE_STRING_SET_CRUDE(rn); sn->end = (UChar* )p; } @@ -3316,10 +3288,10 @@ str_node_can_be_split(Node* node, OnigEncoding enc) return 0; } -extern int -onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) +static int +scan_number(UChar** src, const UChar* end, OnigEncoding enc) { - unsigned int num, val; + int num, val; OnigCodePoint c; UChar* p = *src; PFETCH_READY; @@ -3328,8 +3300,8 @@ onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) while (! PEND) { PFETCH(c); if (IS_CODE_DIGIT_ASCII(enc, c)) { - val = (unsigned int )DIGITVAL(c); - if ((INT_MAX_LIMIT - val) / 10UL < num) + val = (int )DIGITVAL(c); + if ((INT_MAX - val) / 10 < num) return -1; /* overflow */ num = num * 10 + val; @@ -3344,26 +3316,27 @@ onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) } static int -scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, - int maxlen, OnigEncoding enc) +scan_hexadecimal_number(UChar** src, UChar* end, int minlen, int maxlen, + OnigEncoding enc, OnigCodePoint* rcode) { + OnigCodePoint code; OnigCodePoint c; - unsigned int num, val; + unsigned int val; int n; UChar* p = *src; PFETCH_READY; - num = 0; + code = 0; n = 0; while (! 
PEND && n < maxlen) { PFETCH(c); if (IS_CODE_XDIGIT_ASCII(enc, c)) { n++; - val = (unsigned int )XDIGITVAL(enc,c); - if ((INT_MAX_LIMIT - val) / 16UL < num) + val = (unsigned int )XDIGITVAL(enc, c); + if ((UINT_MAX - val) / 16UL < code) return ONIGERR_TOO_BIG_NUMBER; /* overflow */ - num = (num << 4) + XDIGITVAL(enc,c); + code = (code << 4) + val; } else { PUNFETCH; @@ -3374,36 +3347,46 @@ scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, if (n < minlen) return ONIGERR_INVALID_CODE_POINT_VALUE; + *rcode = code; *src = p; - return num; + return ONIG_NORMAL; } static int -scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, - OnigEncoding enc) +scan_octal_number(UChar** src, UChar* end, int minlen, int maxlen, + OnigEncoding enc, OnigCodePoint* rcode) { + OnigCodePoint code; OnigCodePoint c; - unsigned int num, val; + unsigned int val; + int n; UChar* p = *src; PFETCH_READY; - num = 0; - while (! PEND && maxlen-- != 0) { + code = 0; + n = 0; + while (! PEND && n < maxlen) { PFETCH(c); if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8') { - val = ODIGITVAL(c); - if ((INT_MAX_LIMIT - val) / 8UL < num) - return -1; /* overflow */ + n++; + val = (unsigned int )ODIGITVAL(c); + if ((UINT_MAX - val) / 8UL < code) + return ONIGERR_TOO_BIG_NUMBER; /* overflow */ - num = (num << 3) + val; + code = (code << 3) + val; } else { PUNFETCH; break; } } + + if (n < minlen) + return ONIGERR_INVALID_CODE_POINT_VALUE; + + *rcode = code; *src = p; - return num; + return ONIG_NORMAL; } @@ -3938,68 +3921,70 @@ static enum ReduceType ReduceTypeTable[6][6] = { {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' 
*/ }; -extern void -onig_reduce_nested_quantifier(Node* pnode, Node* cnode) +extern int +onig_reduce_nested_quantifier(Node* pnode) { int pnum, cnum; QuantNode *p, *c; + Node* cnode; + + cnode = NODE_BODY(pnode); p = QUANT_(pnode); c = QUANT_(cnode); pnum = quantifier_type_num(p); cnum = quantifier_type_num(c); if (pnum < 0 || cnum < 0) { - if ((p->lower == p->upper) && ! IS_INFINITE_REPEAT(p->upper)) { - if ((c->lower == c->upper) && ! IS_INFINITE_REPEAT(c->upper)) { - int n = onig_positive_int_multiply(p->lower, c->lower); - if (n >= 0) { - p->lower = p->upper = n; - NODE_BODY(pnode) = NODE_BODY(cnode); - goto remove_cnode; - } - } + if (p->lower == p->upper && c->lower == c->upper) { + int n = onig_positive_int_multiply(p->lower, c->lower); + if (n < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + + p->lower = p->upper = n; + NODE_BODY(pnode) = NODE_BODY(cnode); + goto remove_cnode; } - return ; + return 0; } switch(ReduceTypeTable[cnum][pnum]) { case RQ_DEL: *pnode = *cnode; + goto remove_cnode; break; case RQ_A: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1; + goto remove_cnode; break; case RQ_AQ: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0; + goto remove_cnode; break; case RQ_QQ: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = 1; p->greedy = 0; + goto remove_cnode; break; case RQ_P_QQ: - NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 0; c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1; - return ; break; case RQ_PQ_Q: - NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 1; c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0; - return ; break; case RQ_ASIS: - NODE_BODY(pnode) = cnode; - return ; break; } + return 0; + remove_cnode: NODE_BODY(cnode) = NULL_NODE; onig_node_free(cnode); + return 0; } static int @@ -4018,7 +4003,7 @@ node_new_general_newline(Node** node, ScanEnv* env) alen = 
ONIGENC_CODE_TO_MBC(env->enc, 0x0a, buf + dlen); if (alen < 0) return alen; - crnl = node_new_str_raw(buf, buf + dlen + alen); + crnl = node_new_str_crude(buf, buf + dlen + alen); CHECK_NULL_RETURN_MEMERR(crnl); ncc = node_new_cclass(); @@ -4046,7 +4031,7 @@ node_new_general_newline(Node** node, ScanEnv* env) if (r != 0) goto err1; } - x = node_new_bag_if_else(crnl, 0, ncc); + x = node_new_bag_if_else(crnl, NULL_NODE, ncc); if (IS_NULL(x)) goto err1; *node = x; @@ -4055,7 +4040,7 @@ node_new_general_newline(Node** node, ScanEnv* env) enum TokenSyms { TK_EOT = 0, /* end of token */ - TK_RAW_BYTE = 1, + TK_CRUDE_BYTE = 1, TK_CHAR, TK_STRING, TK_CODE_POINT, @@ -4070,7 +4055,7 @@ enum TokenSyms { TK_ALT, TK_SUBEXP_OPEN, TK_SUBEXP_CLOSE, - TK_CC_OPEN, + TK_OPEN_CC, TK_QUOTE_OPEN, TK_CHAR_PROPERTY, /* \p{...}, \P{...} */ TK_KEEP, /* \K */ @@ -4082,9 +4067,9 @@ enum TokenSyms { /* in cc */ TK_CC_CLOSE, TK_CC_RANGE, - TK_POSIX_BRACKET_OPEN, - TK_CC_AND, /* && */ - TK_CC_CC_OPEN /* [ */ + TK_CC_POSIX_BRACKET_OPEN, + TK_CC_AND, /* && */ + TK_CC_OPEN_CC /* [ */ }; typedef struct { @@ -4094,7 +4079,7 @@ typedef struct { UChar* backp; union { UChar* s; - int c; + UChar byte; OnigCodePoint code; int anchor; int subtype; @@ -4129,7 +4114,7 @@ typedef struct { static int -fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) +fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env) { int low, up, syn_allow, non_low = 0; int r = 0; @@ -4154,7 +4139,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) } } - low = onig_scan_unsigned_number(&p, end, env->enc); + low = scan_number(&p, end, env->enc); if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; if (low > ONIG_MAX_REPEAT_NUM) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; @@ -4173,7 +4158,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) PFETCH(c); if (c == ',') { UChar* prev = p; - up = onig_scan_unsigned_number(&p, end, 
env->enc); + up = scan_number(&p, end, env->enc); if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; if (up > ONIG_MAX_REPEAT_NUM) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; @@ -4196,7 +4181,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) if (PEND) goto invalid; PFETCH(c); if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { - if (c != MC_ESC(env->syntax)) goto invalid; + if (c != MC_ESC(env->syntax) || PEND) goto invalid; PFETCH(c); } if (c != '}') goto invalid; @@ -4419,7 +4404,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, PFETCH(c); if (! IS_CODE_DIGIT_ASCII(enc, c)) goto err; PUNFETCH; - level = onig_scan_unsigned_number(&p, end, enc); + level = scan_number(&p, end, enc); if (level < 0) return ONIGERR_TOO_BIG_NUMBER; *rlevel = (level * flag); exist_level = 1; @@ -4440,7 +4425,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, end: if (r == 0) { if (*num_type != IS_NOT_NUM) { - *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + *rback_num = scan_number(&pnum_head, name_end, enc); if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; else if (*rback_num == 0) { if (*num_type == IS_REL_NUM) @@ -4468,7 +4453,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, static int fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int* rback_num, - enum REF_NUM* num_type, int ref) + enum REF_NUM* num_type, int is_ref) { int r, sign; int digit_count; @@ -4498,7 +4483,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, return ONIGERR_EMPTY_GROUP_NAME; if (IS_CODE_DIGIT_ASCII(enc, c)) { - if (ref == 1) + if (is_ref == TRUE) *num_type = IS_ABS_NUM; else { r = ONIGERR_INVALID_GROUP_NAME; @@ -4506,7 +4491,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, digit_count++; } else if (c == '-') { - if (ref == 1) { + if (is_ref == TRUE) { *num_type = 
IS_REL_NUM; sign = -1; pnum_head = p; @@ -4516,7 +4501,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } } else if (c == '+') { - if (ref == 1) { + if (is_ref == TRUE) { *num_type = IS_REL_NUM; sign = 1; pnum_head = p; @@ -4566,7 +4551,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } if (*num_type != IS_NOT_NUM) { - *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + *rback_num = scan_number(&pnum_head, name_end, enc); if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; else if (*rback_num == 0) { if (*num_type == IS_REL_NUM) { @@ -4698,7 +4683,8 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, static int fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int num; + int r; + OnigCodePoint code; OnigCodePoint c, c2; OnigSyntaxType* syn = env->syntax; OnigEncoding enc = env->enc; @@ -4714,7 +4700,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PFETCH(c); tok->type = TK_CHAR; tok->base = 0; - tok->u.c = c; + tok->u.code = c; tok->escaped = 0; if (c == ']') { @@ -4731,7 +4717,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PFETCH(c); tok->escaped = 1; - tok->u.c = c; + tok->u.code = c; switch (c) { case 'w': tok->type = TK_CHAR_TYPE; @@ -4804,8 +4790,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { PINC; - num = scan_unsigned_octal_number(&p, end, 11, enc); - if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + r = scan_octal_number(&p, end, 0, 11, enc, &code); + if (r < 0) return r; if (!PEND) { c2 = PPEEK; if (IS_CODE_DIGIT_ASCII(enc, c2)) @@ -4816,7 +4802,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; tok->type = TK_CODE_POINT; tok->base = 8; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -4831,13 
+4817,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); - if (num < 0) { - if (num == ONIGERR_TOO_BIG_NUMBER) - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - else - return num; - } + r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); + if (r < 0) return r; if (!PEND) { c2 = PPEEK; if (IS_CODE_XDIGIT_ASCII(enc, c2)) @@ -4848,7 +4829,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -4856,14 +4837,14 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 16; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; @@ -4872,14 +4853,14 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. 
*/ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } break; @@ -4888,22 +4869,23 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { PUNFETCH; prev = p; - num = scan_unsigned_octal_number(&p, end, 3, enc); - if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER; + r = scan_octal_number(&p, end, 0, 3, enc, &code); + if (r < 0) return r; + if (code >= 256) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 8; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; default: PUNFETCH; - num = fetch_escaped_value(&p, end, env, &c2); - if (num < 0) return num; - if (tok->u.c != c2) { + r = fetch_escaped_value(&p, end, env, &c2); + if (r < 0) return r; + if (tok->u.code != c2) { tok->u.code = c2; tok->type = TK_CODE_POINT; } @@ -4917,7 +4899,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; if (str_exist_check_with_esc(send, 2, p, end, (OnigCodePoint )']', enc, syn)) { - tok->type = TK_POSIX_BRACKET_OPEN; + tok->type = TK_CC_POSIX_BRACKET_OPEN; } else { PUNFETCH; @@ -4927,7 +4909,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) else { cc_in_cc: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) { - tok->type = TK_CC_CC_OPEN; + tok->type = TK_CC_OPEN_CC; } else { CC_ESC_WARN(env, (UChar* )"["); @@ -4950,7 +4932,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int r, num; + int r; + OnigCodePoint code; OnigCodePoint c; OnigEncoding enc = env->enc; OnigSyntaxType* syn = env->syntax; @@ -4975,7 +4958,7 @@ fetch_token(PToken* tok, UChar** src, UChar* 
end, ScanEnv* env) tok->backp = p; PFETCH(c); - tok->u.c = c; + tok->u.code = c; tok->escaped = 1; switch (c) { case '*': @@ -5026,7 +5009,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; - r = fetch_interval_quantifier(&p, end, tok, env); + r = fetch_interval(&p, end, tok, env); if (r < 0) return r; /* error */ if (r == 0) goto greedy_check2; else if (r == 2) { /* {n} */ @@ -5214,8 +5197,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { PINC; - num = scan_unsigned_octal_number(&p, end, 11, enc); - if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + r = scan_octal_number(&p, end, 0, 11, enc, &code); + if (r < 0) return r; if (!PEND) { if (IS_CODE_DIGIT_ASCII(enc, PPEEK)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; @@ -5224,7 +5207,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -5239,13 +5222,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); - if (num < 0) { - if (num == ONIGERR_TOO_BIG_NUMBER) - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - else - return num; - } + r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); + if (r < 0) return r; if (!PEND) { if (IS_CODE_XDIGIT_ASCII(enc, PPEEK)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; @@ -5254,7 +5232,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* 
can't read nothing or invalid format */ @@ -5262,14 +5240,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 16; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; @@ -5278,14 +5256,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } break; @@ -5293,21 +5271,21 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '5': case '6': case '7': case '8': case '9': PUNFETCH; prev = p; - num = onig_scan_unsigned_number(&p, end, enc); - if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { + r = scan_number(&p, end, enc); + if (r < 0 || r > ONIG_MAX_BACKREF_NUM) { goto skip_backref; } if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && - (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ + (r <= env->num_mem || r <= 9)) { /* This spec. 
from GNU regex */ if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { - if (num > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[num].node)) + if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; tok->u.backref.num = 1; - tok->u.backref.ref1 = num; + tok->u.backref.ref1 = r; tok->u.backref.by_name = 0; #ifdef USE_BACKREF_WITH_LEVEL tok->u.backref.exist_level = 0; @@ -5327,14 +5305,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '0': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { prev = p; - num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); - if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER; + r = scan_octal_number(&p, end, 0, (c == '0' ? 2:3), enc, &code); + if (r < 0 || r >= 256) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 8; - tok->u.c = num; + tok->u.byte = (UChar )code; } else if (c != '0') { PINC; @@ -5359,7 +5337,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (r == 1) tok->u.backref.exist_level = 1; else tok->u.backref.exist_level = 0; #else - r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, 1); + r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) return r; @@ -5372,7 +5350,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].node)) + IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; @@ -5381,7 +5359,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.ref1 = back_num; } else { - num = name_to_group_numbers(env, prev, name_end, &backs); + int num = 
name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { return ONIGERR_UNDEFINED_NAME_REFERENCE; } @@ -5389,7 +5367,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].node)) + IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } @@ -5422,7 +5400,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, - &gnum, &num_type, 1); + &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type != IS_NOT_NUM) { @@ -5483,10 +5461,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) OnigCodePoint c2; PUNFETCH; - num = fetch_escaped_value(&p, end, env, &c2); - if (num < 0) return num; - /* set_raw: */ - if (tok->u.c != c2) { + r = fetch_escaped_value(&p, end, env, &c2); + if (r < 0) return r; + if (tok->u.code != c2) { tok->type = TK_CODE_POINT; tok->u.code = c2; } @@ -5498,7 +5475,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else { - tok->u.c = c; + tok->u.code = c; tok->escaped = 0; #ifdef USE_VARIABLE_META_CHARS @@ -5563,7 +5540,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; - r = fetch_interval_quantifier(&p, end, tok, env); + r = fetch_interval(&p, end, tok, env); if (r < 0) return r; /* error */ if (r == 0) goto greedy_check2; else if (r == 2) { /* {n} */ @@ -5611,8 +5588,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { PINC; name = p; - r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, - &num_type, 0); + r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, + &gnum, &num_type, FALSE); if (r < 0) return r; tok->type = TK_CALL; @@ -5644,7 +5621,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { name = p; r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, - &gnum, &num_type, 1); + &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type == IS_NOT_NUM) { @@ -5700,7 +5677,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '[': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; - tok->type = TK_CC_OPEN; + tok->type = TK_OPEN_CC; break; case ']': @@ -5911,6 +5888,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) int c, r; int ascii_mode; + int is_single; const OnigCodePoint *ranges; OnigCodePoint limit; OnigCodePoint sb_out; @@ -5932,6 +5910,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } r = 0; + is_single = ONIGENC_IS_SINGLEBYTE(enc); limit = ascii_mode ? ASCII_LIMIT : SINGLE_BYTE_SIZE; switch (ctype) { @@ -5948,19 +5927,25 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) case ONIGENC_CTYPE_ALNUM: if (not != 0) { for (c = 0; c < (int )limit; c++) { - if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) { + if (! 
ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } } for (c = limit; c < SINGLE_BYTE_SIZE; c++) { - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) + BITSET_SET_BIT(cc->bs, c); } - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + if (is_single == 0) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) { + if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } } } break; @@ -5970,21 +5955,25 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) case ONIGENC_CTYPE_WORD: if (not != 0) { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_CODE_TO_MBCLEN(enc, c) > 0 /* check invalid code point */ + /* check invalid code point */ + if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) && ! 
ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) BITSET_SET_BIT(cc->bs, c); } for (c = limit; c < SINGLE_BYTE_SIZE; c++) { - if (ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) BITSET_SET_BIT(cc->bs, c); } + if (ascii_mode != 0 && is_single == 0) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) + && ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) BITSET_SET_BIT(cc->bs, c); } - if (ascii_mode == 0) + if (ascii_mode == 0 && is_single == 0) ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } break; @@ -6076,10 +6065,12 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) { int r; OnigCodePoint c; - OnigEncoding enc = env->enc; - UChar *prev, *start, *p = *src; + OnigEncoding enc; + UChar *prev, *start, *p; - r = 0; + p = *src; + enc = env->enc; + r = ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; start = prev = p; while (!PEND) { @@ -6087,18 +6078,20 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) PFETCH_S(c); if (c == '}') { r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); - if (r < 0) break; + if (r >= 0) { + *src = p; + } + else { + onig_scan_env_set_error_string(env, r, *src, prev); + } - *src = p; return r; } else if (c == '(' || c == ')' || c == '{' || c == '|') { - r = ONIGERR_INVALID_CHAR_PROPERTY_NAME; break; } } - onig_scan_env_set_error_string(env, r, *src, prev); return r; } @@ -6114,7 +6107,7 @@ parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* en *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - r = add_ctype_to_cc(cc, ctype, 0, env); + r = add_ctype_to_cc(cc, ctype, FALSE, env); if (r != 0) return r; if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); @@ -6122,67 +6115,67 @@ parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, 
ScanEnv* en } -enum CCSTATE { - CCS_VALUE, - CCS_RANGE, - CCS_COMPLETE, - CCS_START -}; +typedef enum { + CS_VALUE, + CS_RANGE, + CS_COMPLETE, + CS_START +} CSTATE; -enum CCVALTYPE { - CCV_SB, - CCV_CODE_POINT, - CCV_CLASS -}; +typedef enum { + CV_UNDEF, + CV_SB, + CV_MB, + CV_CPROP +} CVAL; static int -next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type, - enum CCSTATE* state, ScanEnv* env) +cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, + ScanEnv* env) { int r; - if (*state == CCS_RANGE) + if (*state == CS_RANGE) return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; - if (*state == CCS_VALUE && *type != CCV_CLASS) { - if (*type == CCV_SB) - BITSET_SET_BIT(cc->bs, (int )(*vs)); - else if (*type == CCV_CODE_POINT) { - r = add_code_range(&(cc->mbuf), env, *vs, *vs); + if (*state == CS_VALUE) { + if (*val == CV_SB) + BITSET_SET_BIT(cc->bs, (int )(*pcode)); + else if (*val == CV_MB) { + r = add_code_range(&(cc->mbuf), env, *pcode, *pcode); if (r < 0) return r; } } - *state = CCS_VALUE; - *type = CCV_CLASS; + *state = CS_VALUE; + *val = CV_CPROP; return 0; } static int -next_state_val(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, - int* from_israw, int to_israw, - enum CCVALTYPE intype, enum CCVALTYPE* type, - enum CCSTATE* state, ScanEnv* env) +cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, + int* from_raw, int to_raw, CVAL intype, CVAL* type, + CSTATE* state, ScanEnv* env) { int r; switch (*state) { - case CCS_VALUE: - if (*type == CCV_SB) { + case CS_VALUE: + if (*type == CV_SB) { if (*from > 0xff) return ONIGERR_INVALID_CODE_POINT_VALUE; BITSET_SET_BIT(cc->bs, (int )(*from)); } - else if (*type == CCV_CODE_POINT) { + else if (*type == CV_MB) { r = add_code_range(&(cc->mbuf), env, *from, *from); if (r < 0) return r; } break; - case CCS_RANGE: + case CS_RANGE: if (intype == *type) { - if (intype == CCV_SB) { + if (intype == CV_SB) { if (*from > 0xff || to > 0xff) return 
ONIGERR_INVALID_CODE_POINT_VALUE; @@ -6211,21 +6204,21 @@ next_state_val(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, if (r < 0) return r; } ccs_range_end: - *state = CCS_COMPLETE; + *state = CS_COMPLETE; break; - case CCS_COMPLETE: - case CCS_START: - *state = CCS_VALUE; + case CS_COMPLETE: + case CS_START: + *state = CS_VALUE; break; default: break; } - *from_israw = to_israw; - *from = to; - *type = intype; + *from_raw = to_raw; + *from = to; + *type = intype; return 0; } @@ -6253,27 +6246,25 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, } static int -parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) { int r, neg, len, fetched, and_start; - OnigCodePoint v, vs; + OnigCodePoint in_code, curr_code; UChar *p; Node* node; CClassNode *cc, *prev_cc; CClassNode work_cc; - - enum CCSTATE state; - enum CCVALTYPE val_type, in_type; - int val_israw, in_israw; + int curr_raw, in_raw; + CSTATE state; + CVAL in_type; + CVAL curr_type; *np = NULL_NODE; - env->parse_depth++; - if (env->parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(env->parse_depth); prev_cc = (CClassNode* )NULL; r = fetch_token_in_cc(tok, src, end, env); - if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { + if (r == TK_CHAR && tok->u.code == (OnigCodePoint )'^' && tok->escaped == 0) { neg = 1; r = fetch_token_in_cc(tok, src, end, env); } @@ -6296,31 +6287,27 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) cc = CCLASS_(node); and_start = 0; - state = CCS_START; + state = CS_START; + curr_type = CV_UNDEF; + p = *src; while (r != TK_CC_CLOSE) { fetched = 0; switch (r) { case TK_CHAR: any_char_in: - len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); - if (len > 1) { - in_type = CCV_CODE_POINT; - } - else if (len < 0) { + len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code); + if (len 
< 0) { r = len; goto err; } - else { - /* sb_char: */ - in_type = CCV_SB; - } - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_type = (len == 1) ? CV_SB : CV_MB; + in_code = tok->u.code; + in_raw = 0; goto val_entry2; break; - case TK_RAW_BYTE: + case TK_CRUDE_BYTE: /* tok->base != 0 : octal or hexadec. */ if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { int i, j; @@ -6329,15 +6316,15 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) UChar* psave = p; int base = tok->base; - buf[0] = tok->u.c; + buf[0] = tok->u.byte; for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; - if (r != TK_RAW_BYTE || tok->base != base) { + if (r != TK_CRUDE_BYTE || tok->base != base) { fetched = 1; break; } - buf[i] = tok->u.c; + buf[i] = tok->u.byte; } if (i < ONIGENC_MBC_MINLEN(env->enc)) { @@ -6362,63 +6349,63 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } if (i == 1) { - v = (OnigCodePoint )buf[0]; - goto raw_single; + in_code = (OnigCodePoint )buf[0]; + goto crude_single; } else { - v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); - in_type = CCV_CODE_POINT; + in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); + in_type = CV_MB; } } else { - v = (OnigCodePoint )tok->u.c; - raw_single: - in_type = CCV_SB; + in_code = (OnigCodePoint )tok->u.byte; + crude_single: + in_type = CV_SB; } - in_israw = 1; + in_raw = 1; goto val_entry2; break; case TK_CODE_POINT: - v = tok->u.code; - in_israw = 1; + in_code = tok->u.code; + in_raw = 1; val_entry: - len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); + len = ONIGENC_CODE_TO_MBCLEN(env->enc, in_code); if (len < 0) { - if (state != CCS_RANGE || + if (state != CS_RANGE || ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || - v < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { + in_code < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { r = len; goto err; } } - in_type = (len == 1 ? 
CCV_SB : CCV_CODE_POINT); + in_type = (len == 1 ? CV_SB : CV_MB); val_entry2: - r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type, - &state, env); + r = cc_char_next(cc, &curr_code, in_code, &curr_raw, in_raw, in_type, + &curr_type, &state, env); if (r != 0) goto err; break; - case TK_POSIX_BRACKET_OPEN: + case TK_CC_POSIX_BRACKET_OPEN: r = parse_posix_bracket(cc, &p, end, env); if (r < 0) goto err; if (r == 1) { /* is not POSIX bracket */ CC_ESC_WARN(env, (UChar* )"["); p = tok->backp; - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_code = tok->u.code; + in_raw = 0; goto val_entry; } - goto next_class; + goto next_cprop; break; case TK_CHAR_TYPE: r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env); if (r != 0) goto err; - next_class: - r = next_state_class(cc, &vs, &val_type, &state, env); + next_cprop: + r = cc_cprop_next(cc, &curr_code, &curr_type, &state, env); if (r != 0) goto err; break; @@ -6431,19 +6418,20 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env); if (r != 0) goto err; - goto next_class; + goto next_cprop; } break; case TK_CC_RANGE: - if (state == CCS_VALUE) { + if (state == CS_VALUE) { r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; if (r == TK_CC_CLOSE) { /* allow [x-] */ range_end_val: - v = (OnigCodePoint )'-'; - in_israw = 0; + in_code = (OnigCodePoint )'-'; + in_raw = 0; goto val_entry; } else if (r == TK_CC_AND) { @@ -6451,20 +6439,21 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto range_end_val; } - if (val_type == CCV_CLASS) { + if (curr_type == CV_CPROP) { r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; goto err; } - state = CCS_RANGE; + state = CS_RANGE; } - else if (state == CCS_START) { + else if (state == CS_START) { /* [-xa] is allowed */ - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_code = tok->u.code; + in_raw = 0; r = 
fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; /* [--x] or [a&&-x] is warned. */ if (r == TK_CC_RANGE || and_start != 0) @@ -6472,15 +6461,17 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto val_entry; } - else if (state == CCS_RANGE) { + else if (state == CS_RANGE) { CC_ESC_WARN(env, (UChar* )"-"); - goto any_char_in; /* [!--x] is allowed */ + goto any_char_in; /* [!--] is allowed */ } - else { /* CCS_COMPLETE */ + else { /* CS_COMPLETE */ r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; - if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ + if (r == TK_CC_CLOSE) + goto range_end_val; /* allow [a-b-] */ else if (r == TK_CC_AND) { CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; @@ -6495,12 +6486,19 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } break; - case TK_CC_CC_OPEN: /* [ */ + case TK_CC_OPEN_CC: /* [ */ { Node *anode; CClassNode* acc; - r = parse_char_class(&anode, tok, &p, end, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); + if (r != 0) goto err; + } + state = CS_COMPLETE; + + r = parse_cc(&anode, tok, &p, end, env); if (r != 0) { onig_node_free(anode); goto cc_open_err; @@ -6516,14 +6514,14 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) case TK_CC_AND: /* && */ { - if (state == CCS_VALUE) { - r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, - &val_type, &state, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); if (r != 0) goto err; } /* initialize local variables */ and_start = 1; - state = CCS_START; + state = CS_START; if (IS_NOT_NULL(prev_cc)) { r = and_cclass(prev_cc, cc, env->enc); @@ -6556,9 +6554,9 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } - if (state == CCS_VALUE) { - 
r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, - &val_type, &state, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); if (r != 0) goto err; } @@ -6591,7 +6589,7 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } *src = p; - env->parse_depth--; + DEC_PARSE_DEPTH(env->parse_depth); return 0; err: @@ -6600,8 +6598,8 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) return r; } -static int parse_subexp(Node** top, PToken* tok, int term, - UChar** src, UChar* end, ScanEnv* env, int group_head); +static int parse_alts(Node** top, PToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env, int group_head); #ifdef USE_CALLOUT @@ -6772,7 +6770,8 @@ parse_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* static int parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, - unsigned int types[], OnigValue vals[], ScanEnv* env) + int max_arg_num, unsigned int types[], OnigValue vals[], + ScanEnv* env) { #define MAX_CALLOUT_ARG_BYTE_LENGTH 128 @@ -6791,9 +6790,9 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; + c = 0; n = 0; while (n < ONIG_CALLOUT_MAX_ARGS_NUM) { - c = 0; cn = 0; esc = 0; eesc = 0; @@ -6826,7 +6825,7 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, size_t clen; add_char: - if (skip_mode == 0) { + if (skip_mode == FALSE) { clen = p - e; if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) return ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */ @@ -6840,7 +6839,10 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, } if (cn != 0) { - if (skip_mode == 0) { + if (max_arg_num >= 0 && n >= max_arg_num) + return ONIGERR_INVALID_CALLOUT_ARG; + + if (skip_mode == FALSE) { if ((types[n] & ONIG_TYPE_LONG) != 0) { int fixed = 0; if (cn > 0) { @@ -6972,7 
+6974,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en /* read for single check only */ save = p; - arg_num = parse_callout_args(1, '}', &p, end, 0, 0, env); + arg_num = parse_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env); if (arg_num < 0) return arg_num; is_not_single = PPEEK_IS(cterm) ? 0 : 1; @@ -6986,7 +6988,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en types[i] = get_callout_arg_type_by_name_id(name_id, i); } - arg_num = parse_callout_args(0, '}', &p, end, types, vals, env); + arg_num = parse_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env); if (arg_num < 0) return arg_num; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; @@ -7086,17 +7088,17 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, group: r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(np, tok, term, &p, end, env, 0); + r = parse_alts(np, tok, term, &p, end, env, FALSE); if (r < 0) return r; *src = p; return 1; /* group */ break; case '=': - *np = onig_node_new_anchor(ANCR_PREC_READ, 0); + *np = onig_node_new_anchor(ANCR_PREC_READ, FALSE); break; case '!': /* preceding read */ - *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, 0); + *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, FALSE); break; case '>': /* (?>...) 
stop backtrack */ *np = node_new_bag(BAG_STOP_BACKTRACK); @@ -7114,9 +7116,9 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; PFETCH(c); if (c == '=') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, 0); + *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, FALSE); else if (c == '!') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, 0); + *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, FALSE); else { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { UChar *name; @@ -7132,7 +7134,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, named_group2: name = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, - &num_type, 0); + &num_type, FALSE); if (r < 0) return r; num = scan_env_add_mem_entry(env); @@ -7146,7 +7148,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, CHECK_NULL_RETURN_MEMERR(*np); BAG_(*np)->m.regnum = num; if (list_capture != 0) - MEM_STATUS_ON_SIMPLE(env->capture_history, num); + MEM_STATUS_ON_SIMPLE(env->cap_history, num); env->num_named++; } else { @@ -7181,7 +7183,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&absent, tok, term, &p, end, env, 1); + r = parse_alts(&absent, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(absent); return r; @@ -7268,7 +7270,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (r == 1) exist_level = 1; #else r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? 
c : '('), - &p, end, &name_end, env, &back_num, &num_type, 1); + &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) { if (is_enclosed == 0) { @@ -7288,11 +7290,11 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].node)) + IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } - condition = node_new_backref_checker(1, &back_num, 0, + condition = node_new_backref_checker(1, &back_num, FALSE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif @@ -7310,12 +7312,12 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].node)) + IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } - condition = node_new_backref_checker(num, backs, 1, + condition = node_new_backref_checker(num, backs, TRUE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif @@ -7357,7 +7359,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, condition_is_checker = 0; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&condition, tok, term, &p, end, env, 0); + r = parse_alts(&condition, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(condition); return r; @@ -7400,7 +7402,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, onig_node_free(condition); return r; } - r = parse_subexp(&target, tok, term, &p, end, env, 1); + r = parse_alts(&target, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(condition); onig_node_free(target); @@ -7465,7 +7467,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; } BAG_(*np)->m.regnum = num; - MEM_STATUS_ON_SIMPLE(env->capture_history, num); + 
MEM_STATUS_ON_SIMPLE(env->cap_history, num); } else { return ONIGERR_UNDEFINED_GROUP_OPTION; @@ -7501,7 +7503,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, case 'm': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { - OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0)); + OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? TRUE : FALSE)); } else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) { @@ -7537,16 +7539,16 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 0); - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 1); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE); break; #ifdef USE_UNICODE_WORD_BREAK case 'w': if (! 
ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 0); - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 1); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE); break; #endif default: @@ -7576,7 +7578,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, &p, end, env, 0); + r = parse_alts(&target, tok, term, &p, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7623,7 +7625,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, CHECK_NULL_RETURN_MEMERR(*np); r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, &p, end, env, 0); + r = parse_alts(&target, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(target); return r; @@ -7633,7 +7635,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (NODE_TYPE(*np) == NODE_BAG) { if (BAG_(*np)->type == BAG_MEMORY) { - /* Don't move this to previous of parse_subexp() */ + /* Don't move this to previous of parse_alts() */ r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np); if (r != 0) return r; } @@ -7653,7 +7655,7 @@ static const char* ReduceQStr[] = { }; static int -set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) +assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env) { QuantNode* qn; @@ -7725,9 +7727,11 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) } } else { + int r; + NODE_BODY(qnode) = target; - onig_reduce_nested_quantifier(qnode, target); - goto q_exit; + r = onig_reduce_nested_quantifier(qnode); + return r; } } break; @@ -7737,7 +7741,6 @@ set_quantifier(Node* qnode, Node* target, int group, 
ScanEnv* env) } NODE_BODY(qnode) = target; - q_exit: return 0; } @@ -7767,6 +7770,38 @@ clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) } #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ +#define ADD_CODE_INTO_CC(cc, code, enc) do {\ + if (ONIGENC_MBC_MINLEN(enc) > 1 || ONIGENC_CODE_TO_MBCLEN(enc, code) != 1) {\ + add_code_range_to_buf(&((cc)->mbuf), code, code);\ + }\ + else {\ + BITSET_SET_BIT((cc)->bs, code);\ + }\ +} while (0) + +extern int +onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, + int n, OnigCodePoint codes[]) +{ + int i; + Node* node; + CClassNode* cc; + + *rnode = NULL_NODE; + + node = node_new_cclass(); + CHECK_NULL_RETURN_MEMERR(node); + + cc = CCLASS_(node); + + for (i = 0; i < n; i++) { + ADD_CODE_INTO_CC(cc, codes[i], enc); + } + + *rnode = node; + return 0; +} + typedef struct { ScanEnv* env; CClassNode* cc; @@ -7780,37 +7815,31 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) IApplyCaseFoldArg* iarg; ScanEnv* env; CClassNode* cc; - BitSetRef bs; iarg = (IApplyCaseFoldArg* )arg; env = iarg->env; cc = iarg->cc; - bs = cc->bs; if (to_len == 1) { int is_in = onig_is_code_in_cc(env->enc, from, cc); #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || (is_in == 0 && IS_NCCLASS_NOT(cc))) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { - add_code_range(&(cc->mbuf), env, *to, *to); - } - else { - BITSET_SET_BIT(bs, *to); - } + ADD_CODE_INTO_CC(cc, *to, env->enc); } #else if (is_in != 0) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { + if (ONIGENC_MBC_MINLEN(env->enc) > 1 || + ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) { if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); add_code_range(&(cc->mbuf), env, *to, *to); } else { if (IS_NCCLASS_NOT(cc)) { - BITSET_CLEAR_BIT(bs, *to); + BITSET_CLEAR_BIT(cc->bs, *to); } else - BITSET_SET_BIT(bs, *to); + BITSET_SET_BIT(cc->bs, *to); } } #endif /* 
CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ @@ -7818,34 +7847,65 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) else { int r, i, len; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - Node *snode = NULL_NODE; if (onig_is_code_in_cc(env->enc, from, cc) #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS && !IS_NCCLASS_NOT(cc) #endif ) { + int n, j, m, index; + Node* list_node; + Node* ns[3]; + + n = 0; for (i = 0; i < to_len; i++) { - len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); - if (i == 0) { - snode = onig_node_new_str(buf, buf + len); - CHECK_NULL_RETURN_MEMERR(snode); - - /* char-class expanded multi-char only - compare with string folded at match time. */ - NODE_STRING_SET_AMBIG(snode); + OnigCodePoint code; + Node* csnode; + CClassNode* cs_cc; + + index = onigenc_unicode_fold1_key(&to[i]); + if (index >= 0) { + csnode = node_new_cclass(); + cs_cc = CCLASS_(csnode); + if (IS_NULL(csnode)) { + err_free_ns: + for (j = 0; j < n; j++) onig_node_free(ns[j]); + return ONIGERR_MEMORY; + } + m = FOLDS1_UNFOLDS_NUM(index); + for (j = 0; j < m; j++) { + code = FOLDS1_UNFOLDS(index)[j]; + ADD_CODE_INTO_CC(cs_cc, code, env->enc); + } + ADD_CODE_INTO_CC(cs_cc, to[i], env->enc); + ns[n++] = csnode; } else { - r = onig_node_str_cat(snode, buf, buf + len); - if (r < 0) { - onig_node_free(snode); - return r; + len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); + if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) { + csnode = onig_node_new_str(buf, buf + len); + if (IS_NULL(csnode)) goto err_free_ns; + + NODE_STRING_SET_CASE_EXPANDED(csnode); + ns[n++] = csnode; + } + else { + r = onig_node_str_cat(ns[n-1], buf, buf + len); + if (r < 0) goto err_free_ns; } } } - *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_MEMERR(*(iarg->ptail)); + if (n == 1) + list_node = ns[0]; + else + list_node = make_list(n, ns); + + *(iarg->ptail) = onig_node_new_alt(list_node, NULL_NODE); + if (IS_NULL(*(iarg->ptail))) { + 
onig_node_free(list_node); + return ONIGERR_MEMORY; + } iarg->ptail = &(NODE_CDR((*(iarg->ptail)))); } } @@ -7901,7 +7961,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = BAG_(*np)->o.options; r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, src, end, env, 0); + r = parse_alts(&target, tok, term, src, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7916,7 +7976,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; - if (tok->escaped) goto tk_raw_byte; + if (tok->escaped) goto tk_crude_byte; else goto tk_byte; break; @@ -7941,36 +8001,36 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, } break; - case TK_RAW_BYTE: - tk_raw_byte: + case TK_CRUDE_BYTE: + tk_crude_byte: { - *np = node_new_str_raw_char((UChar )tok->u.c); + *np = node_new_str_crude_char(tok->u.byte); CHECK_NULL_RETURN_MEMERR(*np); len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { if (len == enclen(env->enc, STR_(*np)->s)) { r = fetch_token(tok, src, end, env); - goto tk_raw_byte_end; + goto tk_crude_byte_end; } } r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_RAW_BYTE) + if (r != TK_CRUDE_BYTE) return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - r = node_str_cat_char(*np, (UChar )tok->u.c); + r = node_str_cat_char(*np, tok->u.byte); if (r < 0) return r; len++; } - tk_raw_byte_end: + tk_crude_byte_end: if (! 
ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) return ONIGERR_INVALID_WIDE_CHAR_VALUE; - NODE_STRING_CLEAR_RAW(*np); + NODE_STRING_CLEAR_CRUDE(*np); goto string_end; } break; @@ -7981,7 +8041,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); if (len < 0) return len; #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - *np = node_new_str_raw(buf, buf + len); + *np = node_new_str_crude(buf, buf + len); #else *np = node_new_str(buf, buf + len); #endif @@ -8024,7 +8084,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); + add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env); if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); } break; @@ -8041,11 +8101,11 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (r != 0) return r; break; - case TK_CC_OPEN: + case TK_OPEN_CC: { CClassNode* cc; - r = parse_char_class(np, tok, src, end, env); + r = parse_cc(np, tok, src, end, env); if (r != 0) return r; cc = CCLASS_(*np); @@ -8083,7 +8143,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_ANYCHAR_ANYTIME: *np = node_new_anychar(); CHECK_NULL_RETURN_MEMERR(*np); - qn = node_new_quantifier(0, INFINITE_REPEAT, 0); + qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE); CHECK_NULL_RETURN_MEMERR(qn); NODE_BODY(qn) = *np; *np = qn; @@ -8186,9 +8246,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (is_invalid_quantifier_target(*tp)) return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; - parse_depth++; - if (parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(parse_depth); qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, r == TK_INTERVAL); @@ -8201,9 +8259,10 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, 
else { target = *tp; } - r = set_quantifier(qn, target, group, env); + r = assign_quantifier_body(qn, target, group, env); if (r < 0) { onig_node_free(qn); + *tp = NULL_NODE; return r; } @@ -8256,6 +8315,8 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, Node *node, **headp; *top = NULL; + INC_PARSE_DEPTH(env->parse_depth); + r = parse_exp(&node, tok, term, src, end, env, group_head); if (r < 0) { onig_node_free(node); @@ -8266,7 +8327,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, *top = node; } else { - *top = node_new_list(node, NULL); + *top = node_new_list(node, NULL); if (IS_NULL(*top)) { onig_node_free(node); return ONIGERR_MEMORY; @@ -8274,7 +8335,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, headp = &(NODE_CDR(*top)); while (r != TK_EOT && r != term && r != TK_ALT) { - r = parse_exp(&node, tok, term, src, end, env, 0); + r = parse_exp(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; @@ -8292,21 +8353,20 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, } } + DEC_PARSE_DEPTH(env->parse_depth); return r; } /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ static int -parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) +parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env, int group_head) { int r; Node *node, **headp; *top = NULL; - env->parse_depth++; - if (env->parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(env->parse_depth); r = parse_branch(&node, tok, term, src, end, env, group_head); if (r < 0) { @@ -8328,7 +8388,7 @@ parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, while (r == TK_ALT) { r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_branch(&node, tok, term, src, end, env, 0); + r = parse_branch(&node, tok, term, src, end, env, FALSE); if 
(r < 0) { onig_node_free(node); return r; @@ -8355,7 +8415,7 @@ parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_PARSER_BUG; } - env->parse_depth--; + DEC_PARSE_DEPTH(env->parse_depth); return r; } @@ -8367,7 +8427,7 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) r = fetch_token(&tok, src, end, env); if (r < 0) return r; - r = parse_subexp(top, &tok, TK_EOT, src, end, env, 0); + r = parse_alts(top, &tok, TK_EOT, src, end, env, FALSE); if (r < 0) return r; return 0; diff --git a/src/regparse.h b/src/regparse.h index 231f7b5..1525ccb 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -4,7 +4,7 @@ regparse.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,7 +32,7 @@ #include "regint.h" #define NODE_STRING_MARGIN 16 -#define NODE_STRING_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ +#define NODE_STRING_BUF_SIZE 20 /* sizeof(CClassNode) - sizeof(int)*4 */ #define NODE_BACKREFS_SIZE 6 /* node type */ @@ -73,20 +73,25 @@ enum BodyEmptyType { BODY_IS_EMPTY_POSSIBILITY_REC = 3 }; +struct _Node; + typedef struct { NodeType node_type; int status; + struct _Node* parent; UChar* s; UChar* end; unsigned int flag; - int capacity; /* (allocated size - 1) or 0: use buf[] */ UChar buf[NODE_STRING_BUF_SIZE]; + int capacity; /* (allocated size - 1) or 0: use buf[] */ + int case_min_len; } StrNode; typedef struct { NodeType node_type; int status; + struct _Node* parent; unsigned int flags; BitSet bs; @@ -96,6 +101,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; int lower; @@ -104,12 +110,13 @@ typedef struct { enum BodyEmptyType emptiness; struct _Node* head_exact; struct _Node* 
next_head_exact; - int is_refered; /* include called node. don't eliminate even if {0} */ + int include_referred; /* include called node. don't eliminate even if {0} */ } QuantNode; typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; enum BagType type; @@ -152,6 +159,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; /* to BagNode : BAG_MEMORY */ int by_number; @@ -166,6 +174,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; int back_num; int back_static[NODE_BACKREFS_SIZE]; @@ -176,6 +185,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; int type; @@ -186,6 +196,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* car; struct _Node* cdr; @@ -194,6 +205,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; int ctype; int not; @@ -204,6 +216,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; enum GimmickType type; int detail_type; @@ -216,6 +229,7 @@ typedef struct _Node { struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; } base; @@ -280,26 +294,21 @@ typedef struct _Node { #define ANCR_ANYCHAR_INF_MASK (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML) #define ANCR_END_BUF_MASK (ANCR_END_BUF | ANCR_SEMI_END_BUF) -#define NODE_STRING_RAW (1<<0) /* by backslashed number */ -#define NODE_STRING_AMBIG (1<<1) -#define NODE_STRING_GOOD_AMBIG (1<<2) -#define NODE_STRING_DONT_GET_OPT_INFO (1<<3) +#define NODE_STRING_CRUDE (1<<0) +#define NODE_STRING_CASE_EXPANDED (1<<1) +#define NODE_STRING_CASE_FOLD_MATCH (1<<2) #define NODE_STRING_LEN(node) (int )((node)->u.str.end - (node)->u.str.s) -#define NODE_STRING_SET_RAW(node) (node)->u.str.flag |= NODE_STRING_RAW -#define NODE_STRING_CLEAR_RAW(node) 
(node)->u.str.flag &= ~NODE_STRING_RAW -#define NODE_STRING_SET_AMBIG(node) (node)->u.str.flag |= NODE_STRING_AMBIG -#define NODE_STRING_SET_GOOD_AMBIG(node) (node)->u.str.flag |= NODE_STRING_GOOD_AMBIG -#define NODE_STRING_SET_DONT_GET_OPT_INFO(node) \ - (node)->u.str.flag |= NODE_STRING_DONT_GET_OPT_INFO -#define NODE_STRING_IS_RAW(node) \ - (((node)->u.str.flag & NODE_STRING_RAW) != 0) -#define NODE_STRING_IS_AMBIG(node) \ - (((node)->u.str.flag & NODE_STRING_AMBIG) != 0) -#define NODE_STRING_IS_GOOD_AMBIG(node) \ - (((node)->u.str.flag & NODE_STRING_GOOD_AMBIG) != 0) -#define NODE_STRING_IS_DONT_GET_OPT_INFO(node) \ - (((node)->u.str.flag & NODE_STRING_DONT_GET_OPT_INFO) != 0) +#define NODE_STRING_SET_CRUDE(node) (node)->u.str.flag |= NODE_STRING_CRUDE +#define NODE_STRING_CLEAR_CRUDE(node) (node)->u.str.flag &= ~NODE_STRING_CRUDE +#define NODE_STRING_SET_CASE_EXPANDED(node) (node)->u.str.flag |= NODE_STRING_CASE_EXPANDED +#define NODE_STRING_SET_CASE_FOLD_MATCH(node) (node)->u.str.flag |= NODE_STRING_CASE_FOLD_MATCH +#define NODE_STRING_IS_CRUDE(node) \ + (((node)->u.str.flag & NODE_STRING_CRUDE) != 0) +#define NODE_STRING_IS_CASE_EXPANDED(node) \ + (((node)->u.str.flag & NODE_STRING_CASE_EXPANDED) != 0) +#define NODE_STRING_IS_CASE_FOLD_MATCH(node) \ + (((node)->u.str.flag & NODE_STRING_CASE_FOLD_MATCH) != 0) #define BACKREFS_P(br) \ (IS_NOT_NULL((br)->back_dynamic) ? 
(br)->back_dynamic : (br)->back_static) @@ -326,6 +335,7 @@ typedef struct _Node { #define NODE_ST_FIXED_OPTION (1<<18) #define NODE_ST_PROHIBIT_RECURSION (1<<19) #define NODE_ST_SUPER (1<<20) +#define NODE_ST_EMPTY_STATUS_CHECK (1<<21) #define NODE_STATUS(node) (((Node* )node)->u.base.status) @@ -355,7 +365,10 @@ typedef struct _Node { ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) #define NODE_IS_STRICT_REAL_REPEAT(node) \ ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0) +#define NODE_IS_EMPTY_STATUS_CHECK(node) \ + ((NODE_STATUS(node) & NODE_ST_EMPTY_STATUS_CHECK) != 0) +#define NODE_PARENT(node) ((node)->u.base.parent) #define NODE_BODY(node) ((node)->u.base.body) #define NODE_QUANT_BODY(node) ((node)->body) #define NODE_BAG_BODY(node) ((node)->body) @@ -368,11 +381,8 @@ typedef struct _Node { (senv)->mem_env_dynamic : (senv)->mem_env_static) typedef struct { - Node* node; -#if 0 - int in; - int recursion; -#endif + Node* mem_node; + Node* empty_repeat_node; } MemEnv; typedef struct { @@ -384,9 +394,8 @@ typedef struct { OnigCaseFoldType case_fold_flag; OnigEncoding enc; OnigSyntaxType* syntax; - MemStatusType capture_history; - MemStatusType bt_mem_start; - MemStatusType bt_mem_end; + MemStatusType cap_history; + MemStatusType backtrack_mem; /* backtrack/recursion */ MemStatusType backrefed_mem; UChar* pattern; UChar* pattern_end; @@ -404,7 +413,10 @@ typedef struct { MemEnv mem_env_static[SCANENV_MEMENV_SIZE]; MemEnv* mem_env_dynamic; unsigned int parse_depth; - +#ifdef ONIG_DEBUG_PARSE + unsigned int max_parse_depth; +#endif + int backref_num; int keep_num; int save_num; int save_alloc_num; @@ -425,9 +437,7 @@ extern int onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map)); extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); -extern 
int onig_scan_unsigned_number P_((UChar** src, const UChar* end, OnigEncoding enc)); -extern void onig_reduce_nested_quantifier P_((Node* pnode, Node* cnode)); -extern void onig_node_conv_to_str_node P_((Node* node, int raw)); +extern int onig_reduce_nested_quantifier P_((Node* pnode)); extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end)); extern void onig_node_free P_((Node* node)); @@ -435,13 +445,13 @@ extern Node* onig_node_new_bag P_((enum BagType type)); extern Node* onig_node_new_anchor P_((int type, int ascii_mode)); extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); extern Node* onig_node_new_list P_((Node* left, Node* right)); -extern Node* onig_node_list_add P_((Node* list, Node* x)); extern Node* onig_node_new_alt P_((Node* left, Node* right)); extern void onig_node_str_clear P_((Node* node)); extern int onig_names_free P_((regex_t* reg)); extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); extern int onig_free_shared_cclass_table P_((void)); extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); +extern int onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]); extern OnigLen onig_get_tiny_min_len(Node* node, unsigned int inhibit_node_types, int* invalid_node); #ifdef USE_CALLOUT diff --git a/src/regposerr.c b/src/regposerr.c index e389531..e1747c5 100644 --- a/src/regposerr.c +++ b/src/regposerr.c @@ -2,7 +2,7 @@ regposerr.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/regposix.c b/src/regposix.c index 09e16ac..b3e78ff 100644 --- a/src/regposix.c +++ b/src/regposix.c @@ -2,7 +2,7 @@ regposix.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/regsyntax.c b/src/regsyntax.c index d4420cc..513c7f7 100644 --- a/src/regsyntax.c +++ b/src/regsyntax.c @@ -2,7 +2,7 @@ regsyntax.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/regtrav.c b/src/regtrav.c index 58a17f5..8307695 100644 --- a/src/regtrav.c +++ b/src/regtrav.c @@ -2,7 +2,7 @@ regtrav.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2004 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/regversion.c b/src/regversion.c index 594a52c..de993d3 100644 --- a/src/regversion.c +++ b/src/regversion.c @@ -2,7 +2,7 @@ regversion.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -2,7 +2,7 @@ sjis.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -149,10 +149,6 @@ code_to_mbc(OnigCodePoint code, UChar *buf) if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff)); *p++ = (UChar )(code & 0xff); -#if 0 - if (enclen(ONIG_ENCODING_SJIS, buf) != (p - buf)) - return REGERR_INVALID_CODE_POINT_VALUE; -#endif return (int )(p - buf); } @@ -179,31 +175,6 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, } } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end); - -} -#endif - -#if 0 -static int -is_code_ctype(OnigCodePoint code, unsigned int ctype) -{ - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) { - return (code_to_mbclen(code) > 1 ? TRUE : FALSE); - } - } - - return FALSE; -} -#endif - static UChar* left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/sjis_prop.c b/src/sjis_prop.c index 3a88a38..e33fbb2 100644 --- a/src/sjis_prop.c +++ b/src/sjis_prop.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -pt -T -L ANSI-C -N onigenc_sjis_lookup_property_name --output-file gperf2.tmp sjis_prop.gperf */ +/* Command-line: gperf -pt -T -L ANSI-C -N onigenc_sjis_lookup_property_name --output-file gperf2.tmp sjis_prop.gperf */ /* Computed positions: -k'1,3' */ #if !((' ' == 32) && ('!' 
== 33) && ('"' == 34) && ('#' == 35) \ diff --git a/src/unicode.c b/src/unicode.c index 5820319..474436a 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -2,7 +2,7 @@ unicode.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -356,16 +356,15 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (fn = 0; fn < 2; fn++) { int index; cs[fn][0] = FOLDS2_FOLD(buk->index)[fn]; + ncs[fn] = 1; index = onigenc_unicode_fold1_key(&cs[fn][0]); if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i]; } - ncs[fn] = m + 1; + ncs[fn] += m; } - else - ncs[fn] = 1; } for (i = 0; i < ncs[0]; i++) { @@ -393,16 +392,15 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (fn = 0; fn < 3; fn++) { int index; cs[fn][0] = FOLDS3_FOLD(buk->index)[fn]; + ncs[fn] = 1; index = onigenc_unicode_fold1_key(&cs[fn][0]); if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i]; } - ncs[fn] = m + 1; + ncs[fn] += m; } - else - ncs[fn] = 1; } for (i = 0; i < ncs[0]; i++) { diff --git a/src/unicode_egcb_data.c b/src/unicode_egcb_data.c index 6a74c77..3c49422 100644 --- a/src/unicode_egcb_data.c +++ b/src/unicode_egcb_data.c @@ -1,6 +1,6 @@ /* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,7 +25,7 @@ * SUCH DAMAGE. 
*/ -#define GRAPHEME_BREAK_PROPERTY_VERSION 12_1_0 +#define GRAPHEME_BREAK_PROPERTY_VERSION 120100 /* CR diff --git a/src/unicode_fold1_key.c b/src/unicode_fold1_key.c index b84b528..171a0fa 100644 --- a/src/unicode_fold1_key.c +++ b/src/unicode_fold1_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold1_key unicode_fold1_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold1_key unicode_fold1_key.gperf */ /* Computed positions: -k'1-3' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -2983,7 +2983,7 @@ onigenc_unicode_fold1_key(OnigCodePoint codes[]) 4026 }; - if (0 == 0) + { int key = hash(codes); diff --git a/src/unicode_fold2_key.c b/src/unicode_fold2_key.c index 2310f0a..c39b19d 100644 --- a/src/unicode_fold2_key.c +++ b/src/unicode_fold2_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold2_key unicode_fold2_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold2_key unicode_fold2_key.gperf */ /* Computed positions: -k'3,6' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -211,7 +211,7 @@ onigenc_unicode_fold2_key(OnigCodePoint codes[]) 129 }; - if (0 == 0) + { int key = hash(codes); diff --git a/src/unicode_fold3_key.c b/src/unicode_fold3_key.c index 0e02a62..295c447 100644 --- a/src/unicode_fold3_key.c +++ b/src/unicode_fold3_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold3_key unicode_fold3_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold3_key unicode_fold3_key.gperf */ /* Computed positions: -k'3,6,9' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,7 +121,7 @@ onigenc_unicode_fold3_key(OnigCodePoint codes[]) 0 }; - if (0 == 0) + { int key = hash(codes); diff --git a/src/unicode_fold_data.c b/src/unicode_fold_data.c index 0dbf9ae..68694b0 100644 --- a/src/unicode_fold_data.c +++ b/src/unicode_fold_data.c @@ -1,7 +1,7 @@ /* This file was generated by make_unicode_fold_data.py. 
*/ #include "regenc.h" -#define UNICODE_CASEFOLD_VERSION 12_1_0 +#define UNICODE_CASEFOLD_VERSION 120100 OnigCodePoint OnigUnicodeFolds1[] = { diff --git a/src/unicode_property_data.c b/src/unicode_property_data.c index 5c1c8a9..0083dd6 100644 --- a/src/unicode_property_data.c +++ b/src/unicode_property_data.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf1.tmp unicode_property_data.gperf */ +/* Command-line: gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf1.tmp unicode_property_data.gperf */ /* Computed positions: -k'1-3,5-6,12,16,$' */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ @@ -29580,7 +29580,8 @@ unicode_lookup_property_name (register const char *str, register size_t len) -#define UNICODE_PROPERTY_VERSION 12_1_0 +#define UNICODE_PROPERTY_VERSION 120100 +#define UNICODE_EMOJI_VERSION 1201 #define PROPERTY_NAME_MAX_SIZE 59 #define CODE_RANGES_NUM 568 diff --git a/src/unicode_property_data_posix.c b/src/unicode_property_data_posix.c index eddc108..e299e85 100644 --- a/src/unicode_property_data_posix.c +++ b/src/unicode_property_data_posix.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf2.tmp unicode_property_data_posix.gperf */ +/* Command-line: gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf2.tmp unicode_property_data_posix.gperf */ /* Computed positions: -k'1,3' */ #if !((' ' == 32) && ('!' 
== 33) && ('"' == 34) && ('#' == 35) \ diff --git a/src/unicode_unfold_key.c b/src/unicode_unfold_key.c index b2228e0..51a037b 100644 --- a/src/unicode_unfold_key.c +++ b/src/unicode_unfold_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_unfold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1,0 -N onigenc_unicode_unfold_key unicode_unfold_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1,0 -N onigenc_unicode_unfold_key unicode_unfold_key.gperf */ /* Computed positions: -k'1-3' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -3288,7 +3288,7 @@ onigenc_unicode_unfold_key(OnigCodePoint code) {0x1e907, 4005, 1} }; - if (0 == 0) + { int key = hash(&code); diff --git a/src/unicode_wb_data.c b/src/unicode_wb_data.c index 7778157..8e1a267 100644 --- a/src/unicode_wb_data.c +++ b/src/unicode_wb_data.c @@ -1,6 +1,6 @@ /* unicode_wb_data.c: Generated by make_unicode_wb_data.py. */ /*- - * Copyright (c) 2019 K.Kosako <kkosako0 AT gmail DOT com> + * Copyright (c) 2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,7 +25,7 @@ * SUCH DAMAGE. */ -#define WORD_BREAK_PROPERTY_VERSION 12_1_0 +#define WORD_BREAK_PROPERTY_VERSION 120100 /* ALetter diff --git a/src/utf16_be.c b/src/utf16_be.c index b66d868..d99af71 100644 --- a/src/utf16_be.c +++ b/src/utf16_be.c @@ -2,7 +2,7 @@ utf16_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -227,39 +227,6 @@ utf16be_mbc_case_fold(OnigCaseFoldType flag, pp, end, fold); } -#if 0 -static int -utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += EncLen_UTF16[*p]; - - if (*p == 0) { - int c, v; - - p++; - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf16be_left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/utf16_le.c b/src/utf16_le.c index cdc74b0..c6edd94 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -2,7 +2,7 @@ utf16_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -227,39 +227,6 @@ utf16le_mbc_case_fold(OnigCaseFoldType flag, fold); } -#if 0 -static int -utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, - const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += EncLen_UTF16[*(p+1)]; - - if (*(p+1) == 0) { - int c, v; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. 
*/ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf16le_left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/utf32_be.c b/src/utf32_be.c index dd17d3b..67e50a2 100644 --- a/src/utf32_be.c +++ b/src/utf32_be.c @@ -2,7 +2,7 @@ utf32_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -119,39 +119,6 @@ utf32be_mbc_case_fold(OnigCaseFoldType flag, fold); } -#if 0 -static int -utf32be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += 4; - - if (*(p+2) == 0 && *(p+1) == 0 && *p == 0) { - int c, v; - - p += 3; - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf32be_left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/utf32_le.c b/src/utf32_le.c index d9fe3c6..2ae2275 100644 --- a/src/utf32_le.c +++ b/src/utf32_le.c @@ -2,7 +2,7 @@ utf32_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -120,38 +120,6 @@ utf32le_mbc_case_fold(OnigCaseFoldType flag, fold); } -#if 0 -static int -utf32le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += 4; - - if (*(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) { - int c, v; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf32le_left_adjust_char_head(const UChar* start, const UChar* s) { @@ -2,7 +2,7 @@ utf8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -97,33 +97,6 @@ is_valid_mbc_string(const UChar* p, const UChar* end) return TRUE; } -#if 0 -static int -is_mbc_newline(const UChar* p, const UChar* end) -{ - if (p < end) { - if (*p == 0x0a) return 1; - -#ifdef USE_UNICODE_ALL_LINE_TERMINATORS -#ifndef USE_CRNL_AS_LINE_TERMINATOR - if (*p == 0x0d) return 1; -#endif - if (p + 1 < end) { - if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */ - return 1; - if (p + 2 < end) { - if ((*(p+2) == 0xa8 || *(p+2) == 0xa9) - && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ - return 1; - } - } -#endif - } - - return 0; -} -#endif - static OnigCodePoint mbc_to_code(const UChar* p, const UChar* end) { diff --git a/test/Makefile.am b/test/Makefile.am index 67b5d1e..4d62568 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -6,9 +6,9 @@ AM_CFLAGS = -Wall -Wno-invalid-source-encoding AM_CPPFLAGS = -I$(top_srcdir)/src if ENABLE_POSIX_API -TESTS = test_utf8 testc testp testcu +TESTS = test_utf8 testc testp testcu test_regset else -TESTS = test_utf8 testc testcu +TESTS = test_utf8 testc testcu test_regset endif check_PROGRAMS = $(TESTS) @@ -24,6 +24,9 @@ if ENABLE_POSIX_API endif @echo "[Oniguruma API, UTF-16 check]" @./testcu | grep RESULT + @echo "" + @echo "[Oniguruma API, regset check]" + @./test_regset test_uchar: @echo "[UChar in oniguruma.h check]" @@ -44,9 +47,13 @@ testp_CFLAGS = -DPOSIX_TEST -Wall -Wno-invalid-source-encoding testcu_SOURCES = testu.c testcu_LDADD = $(lib_onig) +test_regset_SOURCES = test_regset.c +test_regset_LDADD = $(lib_onig) + gcov: make CFLAGS="--coverage" test_utf8 make CFLAGS="--coverage" testc make CFLAGS="--coverage" testp make CFLAGS="--coverage" testcu + make CFLAGS="--coverage" test_regset diff --git a/test/test_regset.c b/test/test_regset.c new file mode 100644 index 0000000..497fbd6 --- /dev/null +++ b/test/test_regset.c @@ -0,0 +1,465 @@ +/* + * test_regset.c --- test for regset API + * Copyright (c) 2019 K.Kosako + 
*/ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <time.h> + +#include "oniguruma.h" + +static int nsucc = 0; +static int nfail = 0; +static int nerror = 0; + + +/* Compile the n patterns in pat[] (UTF-8, default option and syntax) and add each to a new regset. + * On success stores the set in *rset and returns 0. + * On failure returns the Oniguruma error code, leaves *rset NULL and frees the partially built set. + * If error_no != 0 a compile error is an expected outcome: it is tallied in nsucc/nfail instead of nerror. */ +static int +make_regset(int line_no, int n, char* pat[], OnigRegSet** rset, int error_no) +{ + int r; + int i; + OnigRegSet* set; + regex_t* reg; + OnigErrorInfo einfo; + + *rset = NULL; + r = onig_regset_new(&set, 0, NULL); + if (r != 0) return r; + + for (i = 0; i < n; i++) { + r = onig_new(&reg, (UChar* )pat[i], (UChar* )(pat[i] + strlen(pat[i])), + ONIG_OPTION_DEFAULT, ONIG_ENCODING_UTF8, ONIG_SYNTAX_DEFAULT, + &einfo); + if (r != 0) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + + if (error_no == 0) { + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stderr, "ERROR: %d: %s /%s/\n", line_no, s, pat[i]); + nerror++; + } + else { + if (r == error_no) { + fprintf(stdout, "OK(ERROR): %d: /%s/ %d\n", line_no, pat[i], r); + nsucc++; + } + else { + fprintf(stdout, "FAIL(ERROR): %d: /%s/ %d, %d\n", + line_no, pat[i], error_no, r); + nfail++; + } + } + onig_regset_free(set); /* callers never see the set on failure, so release it here */ + return r; + } + + r = onig_regset_add(set, reg); + if (r != 0) { + onig_free(reg); /* the failed add did not hand reg over to the set */ + onig_regset_free(set); + fprintf(stderr, "ERROR: %d: onig_regset_add(): /%s/\n", line_no, pat[i]); + nerror++; + return r; + } + } + + *rset = set; + return 0; +} + +#ifndef _WIN32 + +static double +get_sec(struct timespec* ts, struct timespec* te) +{ + double t; + + t = (te->tv_sec - ts->tv_sec) + + (double )(te->tv_nsec - ts->tv_nsec) / 1000000000.0; + return t; +} + +/* clock_gettime() doesn't exist in Windows */ + +static int +time_test(int repeat, int n, char* ps[], char* s, char* end, double* rt_set, double* rt_reg) +{ + int r; + int i; + int match_pos; + OnigRegSet* set; + struct timespec ts1, ts2; + double t_set, t_reg; + + r = make_regset(0, n, ps, &set, 0); + if (r != 0) return r; + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1); + + for (i = 0; i < repeat; i++) { + r = onig_regset_search(set, (UChar* )s, (UChar* )end, (UChar* )s, (UChar* )end, +
ONIG_REGSET_POSITION_LEAD, ONIG_OPTION_NONE, &match_pos); + if (r < 0) { + fprintf(stderr, "FAIL onig_regset_search(POSITION_LEAD): %d\n", r); + onig_regset_free(set); /* do not leak the set on the error path */ + return r; + } + } + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2); + t_set = get_sec(&ts1, &ts2); + + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1); + + for (i = 0; i < repeat; i++) { + r = onig_regset_search(set, (UChar* )s, (UChar* )end, (UChar* )s, (UChar* )end, + ONIG_REGSET_REGEX_LEAD, ONIG_OPTION_NONE, &match_pos); + if (r < 0) { + fprintf(stderr, "FAIL onig_regset_search(REGEX_LEAD): %d\n", r); + onig_regset_free(set); /* do not leak the set on the error path */ + return r; + } + } + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2); + onig_regset_free(set); + + t_reg = get_sec(&ts1, &ts2); + + *rt_set = t_set; + *rt_reg = t_reg; + return 0; +} +#endif + +/* Copy ps[0..n-1] into cps[] and shuffle cps[] in place using rand(). */ +static void +fisher_yates_shuffle(int n, char* ps[], char* cps[]) +{ +#define GET_RAND(n) (rand()%(n+1)) +#define SWAP(a,b) { char* tmp = a; a = b; b = tmp; } + + int i; + + for (i = 0; i < n; i++) + cps[i] = ps[i]; + + for (i = n - 1; i > 0; i--) { + int x = GET_RAND(i); + SWAP(cps[i], cps[x]); + } +} + +#ifndef _WIN32 +/* Run time_test() n times, each on a freshly shuffled copy of ps[], and print the + * accumulated POSITION_LEAD and REGEX_LEAD times in msec. Prints nothing if the + * scratch array cannot be allocated or a time_test() run fails. */ +static void +time_compare(int n, char* ps[], char* s, char* end) +{ + int r; + int i; + int repeat; + double t_set, t_reg; + double total_set, total_reg; + char** cps; + + cps = (char** )malloc(sizeof(char*) * n); + if (cps == 0) return ; + + repeat = 100 / n; + total_set = total_reg = 0.0; + for (i = 0; i < n; i++) { + fisher_yates_shuffle(n, ps, cps); + r = time_test(repeat, n, cps, s, end, &t_set, &t_reg); + if (r != 0) { + free(cps); /* release the scratch array before bailing out */ + return ; + } + total_set += t_set; + total_reg += t_reg; + } + + free(cps); + + fprintf(stdout, "POS lead: %6.2lfmsec. 
REG lead: %6.2lfmsec.\n", + total_set * 1000.0, total_reg * 1000.0); +} +#endif + + +static OnigRegSetLead XX_LEAD = ONIG_REGSET_POSITION_LEAD; + +/* Build a regset from ps[0..n-1], search s with the current XX_LEAD mode and tally the outcome. + * from/to: expected byte range of capture group `mem` in the matching regex. + * not: non-zero when a mismatch is the expected result. + * error_no: expected error code, or 0 when no error is expected. + * The result is recorded in nsucc / nfail / nerror and reported on stdout/stderr. */ +static void +xx(int line_no, int n, char* ps[], char* s, int from, int to, int mem, int not, int error_no) +{ + int r; + int match_pos; + int match_index; + OnigRegSet* set; + char *end; + + r = make_regset(line_no, n, ps, &set, error_no); + if (r != 0) return ; + + end = s + strlen(s); + + r = onig_regset_search(set, (UChar* )s, (UChar* )end, (UChar* )s, (UChar* )end, + XX_LEAD, ONIG_OPTION_NONE, &match_pos); + if (r < 0) { + if (r == ONIG_MISMATCH) { + if (not) { + fprintf(stdout, "OK(N): %d\n", line_no); + nsucc++; + } + else { + fprintf(stdout, "FAIL: %d\n", line_no); + nfail++; + } + } + else { + if (error_no == 0) { + char buf[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )buf, r); + fprintf(stderr, "ERROR: %d: %s\n", line_no, buf); + nerror++; + } + else { + if (r == error_no) { + fprintf(stdout, "OK(ERROR): %d: %d\n", line_no, r); + nsucc++; + } + else { + fprintf(stdout, "FAIL ERROR NO: %d: %d, %d\n", line_no, error_no, r); + nfail++; + } + } + } + } + else { + if (not) { + fprintf(stdout, "FAIL(N): %d\n", line_no); + nfail++; + } + else { + OnigRegion* region; + + match_index = r; + region = onig_regset_get_region(set, match_index); + if (region == 0) { + fprintf(stderr, "ERROR: %d: can't get region.\n", line_no); + nerror++; + onig_regset_free(set); /* early return must not leak the set */ + return ; + } + + if (region->beg[mem] == from && region->end[mem] == to) { + fprintf(stdout, "OK: %d\n", line_no); + nsucc++; + } + else { + char buf[1000]; + int len; + len = region->end[mem] - region->beg[mem]; + /* clamp so a missing group or a long match cannot overrun buf */ + if (region->beg[mem] < 0 || len < 0) len = 0; + if (len > (int )sizeof(buf) - 1) len = (int )sizeof(buf) - 1; + if (len > 0) strncpy(buf, s + region->beg[mem], len); + buf[len] = '\0'; + fprintf(stdout, "FAIL: %d: %d-%d : %d-%d (%s)\n", line_no, + from, to, region->beg[mem], region->end[mem], buf); + nfail++; + } + } + } + + onig_regset_free(set); +} + +static void +x2(int line_no, int n, char* ps[], char* s, int from, int to) +{ + xx(line_no, n, ps, s, from, to, 0, 0, 0); +} +
+static void +x3(int line_no, int n, char* ps[], char* s, int from, int to, int mem) +{ + xx(line_no, n, ps, s, from, to, mem, 0, 0); +} + +static void +n(int line_no, int n, char* ps[], char* s) +{ + xx(line_no, n, ps, s, 0, 0, 0, 1, 0); +} + +#define ASIZE(a) (sizeof(a)/sizeof((a)[0])) +#define X2(ps,s,from,to) x2(__LINE__,ASIZE(ps),ps,s,from,to) +#define X3(ps,s,from,to,mem) x3(__LINE__,ASIZE(ps),ps,s,from,to,mem) +#define N(ps,s) n(__LINE__,ASIZE(ps),ps,s) +#define NZERO(s) n(__LINE__,0,(char** )0,s) + +#ifndef _WIN32 + +/* getdelim() doesn't exist in Windows */ + +/* Read the whole of `path` into a malloc'ed buffer. + * On success returns 0, stores the buffer start in *rs and one-past-the-end in *rend; the caller frees *rs. + * Returns -1 if the file cannot be opened and -2 if reading fails; *rs and *rend are untouched then. */ +static int +get_all_content_of_file(char* path, char** rs, char** rend) +{ + ssize_t len; /* getdelim() returns -1 on failure, so this must be signed */ + size_t n; + char* line; + FILE* fp; + + fp = fopen(path, "r"); + if (fp == 0) return -1; + + n = 0; + line = NULL; + len = getdelim(&line, &n, EOF, fp); + fclose(fp); + if (len < 0) { + free(line); /* getdelim() may have allocated the buffer even though it failed */ + return -2; + } + + *rs = line; + *rend = line + len; + return 0; +} +#endif + + +#define TEXT_PATH "kofu-utf8.txt" + +/* --- To get kofu.txt --- + $ wget https://www.aozora.gr.jp/cards/000148/files/774_ruby_1640.zip + $ unzip 774_ruby_1640.zip + $ nkf -Lu -w8 kofu.txt > kofu-utf8.txt + (convert encoding to utf-8 with BOM and line terminator to be Unix-form) +*/ + +static char* p1[] = { + "abc", + "(bca)", + "(cab)" +}; + +static char* p2[] = { + "小説", + "9", + "夏目漱石", +}; + +static char* p3[] = { + "^いる。", + "^校正", + "^底本", + "^ 翌日", +}; + +static char* p4[] = { + "《[^》]{5}》", + "《[^》]{6}》", + "《[^》]{7}》", + "《[^》]{8}》", + "《[^》]{9}》", + "《[^》]{10}》", + "《[^》]{11}》", + "《[^》]{12}》", + "《[^》]{13}》", + "《[^》]{14}》", + "《[^》]{15}》", + "《[^》]{16}》", + "《[^》]{17}》", + "《[^》]{18}》", + "《[^》]{19}》", + "《[^》]{20}》", +}; + +static char* p5[] = { + "小室圭", + "bbbbbb", + "ドナルド・トランプ", + "筑摩書房", + "松原", + "aaaaaaaaa", + "bbbbbbbbb", + "ccccc", + "ddddddddddd", + "eee", + "ffffffffffff", + "gggggggggg", + "hhhhhhhhhhhhhh", + "iiiiiii", +}; + +static char* p6[] = { + "^.{1000,}", + "松原", + "小室圭", + "ドナルド・トランプ", + "筑摩書房", +}; + +static char* p7[] = { +
"0+", "1+", "2+", "3+", "4+", "5+", "6+", "7+", "8+", "9+", +}; + +extern int +main(int argc, char* argv[]) +{ + int r; + int file_exist; + char *s, *end; + OnigEncoding use_encs[1]; + + use_encs[0] = ONIG_ENCODING_UTF8; + onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); + + srand(12345); + + XX_LEAD = ONIG_REGSET_POSITION_LEAD; + + NZERO(" abab bccab ca"); + X2(p1, " abab bccab ca", 8, 11); + X3(p1, " abab bccab ca", 8, 11, 1); + N(p2, " XXXX AAA 1223 012345678bbb"); + X2(p2, "0123456789", 9, 10); + X2(p7, "abcde 555 qwert", 6, 9); + + XX_LEAD = ONIG_REGSET_REGEX_LEAD; + + NZERO(" abab bccab ca"); + X2(p1, " abab bccab ca", 8, 11); + X3(p1, " abab bccab ca", 8, 11, 1); + N(p2, " XXXX AAA 1223 012345678bbb"); + X2(p2, "0123456789", 9, 10); + X2(p7, "abcde 555 qwert", 6, 9); + +#ifndef _WIN32 + r = get_all_content_of_file(TEXT_PATH, &s, &end); + if (r == 0) { + fprintf(stdout, "FILE: %s, size: %d\n", TEXT_PATH, (int )(end - s)); + file_exist = 1; + } + else { + fprintf(stdout, "Ignore %s\n", TEXT_PATH); + file_exist = 0; + } +#else + file_exist = 0; +#endif + + if (file_exist != 0) { + X2(p2, s, 10, 22); + X2(p3, s, 496079, 496088); + X2(p4, s, 1294, 1315); + } + + fprintf(stdout, + "\nRESULT SUCC: %4d, FAIL: %d, ERROR: %d (by Oniguruma %s)\n", + nsucc, nfail, nerror, onig_version()); + + if (file_exist != 0) { +#ifndef _WIN32 + fprintf(stdout, "\n"); + time_compare(ASIZE(p2), p2, s, end); + time_compare(ASIZE(p3), p3, s, end); + time_compare(ASIZE(p4), p4, s, end); + time_compare(ASIZE(p5), p5, s, end); + time_compare(ASIZE(p6), p6, s, end); + fprintf(stdout, "\n"); +#endif + free(s); + } + + onig_end(); + + return ((nfail == 0 && nerror == 0) ? 
0 : -1); +} diff --git a/test/test_utf8.c b/test/test_utf8.c index 2338526..d6fc761 100644 --- a/test/test_utf8.c +++ b/test/test_utf8.c @@ -132,8 +132,9 @@ static void e(char* pattern, char* str, int error_no) extern int main(int argc, char* argv[]) { - static OnigEncoding use_encs[] = { ONIG_ENCODING_UTF8 }; + OnigEncoding use_encs[1]; + use_encs[0] = ONIG_ENCODING_UTF8; onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); err_file = stdout; @@ -298,6 +299,8 @@ extern int main(int argc, char* argv[]) x2("(?i:xssy)", "xs\xc5\xbfy", 0, 5); x2("(?i:xssy)", "x\xc3\x9fy", 0, 4); x2("(?i:xssy)", "x\xe1\xba\x9ey", 0, 5); + x2("(?i:x\xc3\x9fy)", "xssy", 0, 4); + x2("(?i:x\xc3\x9fy)", "xSSy", 0, 4); x2("(?i:\xc3\x9f)", "ss", 0, 2); x2("(?i:\xc3\x9f)", "SS", 0, 2); x2("(?i:[\xc3\x9f])", "ss", 0, 2); @@ -1204,6 +1207,78 @@ extern int main(int argc, char* argv[]) x2("a{2,3}+a", "aaa", 0, 3); /* == (?:a{2,3})+*/ x2("[\\x{0}-\\x{7fffffff}]", "a", 0, 1); x2("[\\x{7f}-\\x{7fffffff}]", "\xe5\xae\xb6", 0, 3); + x2("[a[cdef]]", "a", 0, 1); + n("[a[xyz]-c]", "b"); + x2("[a[xyz]-c]", "a", 0, 1); + x2("[a[xyz]-c]", "-", 0, 1); + x2("[a[xyz]-c]", "c", 0, 1); + + x2("((?(a)\\g<1>|b))", "aab", 0, 3); + x2("((?(a)\\g<1>))", "aab", 0, 2); + x2("(b(?(a)|\\g<1>))", "bba", 0, 3); + e("(()(?(2)\\g<1>))", "", ONIGERR_NEVER_ENDING_RECURSION); + + x2("(?i)st", "st", 0, 2); + x2("(?i)st", "St", 0, 2); + x2("(?i)st", "sT", 0, 2); + x2("(?i)st", "\xC5\xBFt", 0, 3); // U+017F + x2("(?i)st", "\xEF\xAC\x85", 0, 3); // U+FB05 + x2("(?i)st", "\xEF\xAC\x86", 0, 3); // U+FB06 + x2("(?i)ast", "Ast", 0, 3); + x2("(?i)ast", "ASt", 0, 3); + x2("(?i)ast", "AsT", 0, 3); + x2("(?i)ast", "A\xC5\xBFt", 0, 4); // U+017F + x2("(?i)ast", "A\xEF\xAC\x85", 0, 4); // U+FB05 + x2("(?i)ast", "A\xEF\xAC\x86", 0, 4); // U+FB06 + x2("(?i)stZ", "stz", 0, 3); + x2("(?i)stZ", "Stz", 0, 3); + x2("(?i)stZ", "sTz", 0, 3); + x2("(?i)stZ", "\xC5\xBFtz", 0, 4); // U+017F + x2("(?i)stZ", "\xEF\xAC\x85z", 0, 4); // U+FB05 + 
x2("(?i)stZ", "\xEF\xAC\x86z", 0, 4); // U+FB06 + x2("(?i)BstZ", "bstz", 0, 4); + x2("(?i)BstZ", "bStz", 0, 4); + x2("(?i)BstZ", "bsTz", 0, 4); + x2("(?i)BstZ", "b\xC5\xBFtz", 0, 5); // U+017F + x2("(?i)BstZ", "b\xEF\xAC\x85z", 0, 5); // U+FB05 + x2("(?i)BstZ", "b\xEF\xAC\x86z", 0, 5); // U+FB06 + x2("(?i).*st\\z", "tttssss\xC5\xBFt", 0, 10); // U+017F + x2("(?i).*st\\z", "tttssss\xEF\xAC\x85", 0, 10); // U+FB05 + x2("(?i).*st\\z", "tttssss\xEF\xAC\x86", 0, 10); // U+FB06 + x2("(?i).*あstい\\z", "tttssssあ\xC5\xBFtい", 0, 16); // U+017F + x2("(?i).*あstい\\z", "tttssssあ\xEF\xAC\x85い", 0, 16); // U+FB05 + x2("(?i).*あstい\\z", "tttssssあ\xEF\xAC\x86い", 0, 16); // U+FB06 + x2("(?i).*\xC5\xBFt\\z", "tttssssst", 0, 9); // U+017F + x2("(?i).*\xEF\xAC\x85\\z", "tttssssあst", 0, 12); // U+FB05 + x2("(?i).*\xEF\xAC\x86い\\z", "tttssssstい", 0, 12); // U+FB06 + x2("(?i).*\xEF\xAC\x85\\z", "tttssssあ\xEF\xAC\x85", 0, 13); + + x2("(?i).*ss", "abcdefghijklmnopqrstuvwxyz\xc3\x9f", 0, 28); // U+00DF + x2("(?i).*ss.*", "abcdefghijklmnopqrstuvwxyz\xc3\x9fxyz", 0, 31); // U+00DF + x2("(?i).*\xc3\x9f", "abcdefghijklmnopqrstuvwxyzss", 0, 28); // U+00DF + x2("(?i).*ss.*", "abcdefghijklmnopqrstuvwxyzSSxyz", 0, 31); + + x2("(?i)ssv", "\xc3\x9fv", 0, 3); // U+00DF + x2("(?i)(?<=ss)v", "SSv", 2, 3); + x2("(?i)(?<=\xc3\x9f)v", "\xc3\x9fv", 2, 3); + //x2("(?i)(?<=\xc3\x9f)v", "ssv", 2, 3); + //x2("(?i)(?<=ss)v", "\xc3\x9fv", 2, 3); + + /* #156 U+01F0 (UTF-8: C7 B0) */ + x2("(?i).+Isssǰ", ".+Isssǰ", 0, 8); + x2(".+Isssǰ", ".+Isssǰ", 0, 8); + x2("(?i)ǰ", "ǰ", 0, 2); + x2("(?i)ǰ", "j\xcc\x8c", 0, 3); + x2("(?i)j\xcc\x8c", "ǰ", 0, 2); + x2("(?i)5ǰ", "5ǰ", 0, 3); + x2("(?i)5ǰ", "5j\xcc\x8c", 0, 4); + x2("(?i)5j\xcc\x8c", "5ǰ", 0, 3); + x2("(?i)ǰv", "ǰV", 0, 3); + x2("(?i)ǰv", "j\xcc\x8cV", 0, 4); + x2("(?i)j\xcc\x8cv", "ǰV", 0, 3); + x2("(?i)[ǰ]", "ǰ", 0, 2); + x2("(?i)[ǰ]", "j\xcc\x8c", 0, 3); + //x2("(?i)[j]\xcc\x8c", "ǰ", 0, 2); n(" \xfd", ""); /* https://bugs.php.net/bug.php?id=77370 */ /* can't use 
\xfc00.. because compiler error: hex escape sequence out of range */ @@ -1212,7 +1287,10 @@ extern int main(int argc, char* argv[]) e("(?i)000000000000000000000\xf0", "", ONIGERR_INVALID_CODE_POINT_VALUE); /* https://bugs.php.net/bug.php?id=77382 */ n("0000\\\xf5", "0"); /* https://bugs.php.net/bug.php?id=77385 */ n("(?i)FFF00000000000000000\xfd", ""); /* https://bugs.php.net/bug.php?id=77394 */ - + e("x{55380}{77590}", "", ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); + e("(xyz){40000}{99999}(?<name>vv)", "", ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); + e("f{90000,90000}{80000,80000}", "", ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); + n("f{90000,90000}{80000,80001}", ""); x2("\\p{Common}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */ x2("\\p{In_Enclosed_CJK_Letters_and_Months}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */ diff --git a/test/testc.c b/test/testc.c index c3174cd..5c60764 100644 --- a/test/testc.c +++ b/test/testc.c @@ -153,8 +153,9 @@ static void n(char* pattern, char* str) extern int main(int argc, char* argv[]) { #ifndef POSIX_TEST - static OnigEncoding use_encs[] = { ONIG_ENCODING_EUC_JP }; + OnigEncoding use_encs[1]; + use_encs[0] = ONIG_ENCODING_EUC_JP; onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); #endif diff --git a/test/testu.c b/test/testu.c index 397da95..24397ab 100644 --- a/test/testu.c +++ b/test/testu.c @@ -190,8 +190,9 @@ static void n(char* pattern, char* str) extern int main(int argc, char* argv[]) { - static OnigEncoding use_encs[] = { ONIG_ENCODING_UTF16_BE }; + OnigEncoding use_encs[1]; + use_encs[0] = ONIG_ENCODING_UTF16_BE; onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); err_file = stdout; |