From b134093d75235a90f09ff591137aed9dbdad6e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Wed, 7 Aug 2019 09:32:37 +0200 Subject: Correct typo in watch file --- debian/watch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/watch b/debian/watch index 8a7b475..2f0e85f 100644 --- a/debian/watch +++ b/debian/watch @@ -4,4 +4,4 @@ dversionmangle=s/\+(debian|dfsg|ds|deb)\d*$//,\ uversionmangle=s/(\d)[_\.\-\+]?((RC|rc|pre|dev|beta|alpha)\d*)$/$1~$2/;s/RC/rc/;s/\-/\./g;s/\_/\./g,\ filenamemangle=s/(?:.*?)?(?:rel|v|oniguruma|ONIGURUMA)?[\-\_]?(\d\S+)\.(tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz)))/oniguruma-$1.$2/ \ https://github.com/kkos/oniguruma/tags \ -(?:.*?/)?(?:rel|v|oniguruma|ONIGURUMA)?[\-\_]?(\d\S+)\.(?:tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz))) \ +(?:.*?/)?(?:rel|v|oniguruma|ONIGURUMA)?[\-\_]?(\d\S+)\.(?:tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz))) -- cgit v1.2.3 From 40f3d0030e6e98bcb02d6523e5ee48497dec49a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Wed, 7 Aug 2019 09:32:48 +0200 Subject: New upstream version 6.9.3 --- .gitignore | 3 + CMakeLists.txt | 41 ++++--- HISTORY | 14 +++ Makefile.am | 6 + README.md | 71 ++++++----- build_harnesses.sh | 31 +++++ configure.ac | 2 +- contributed/libfuzzer-onig.cpp | 2 + doc/API | 26 ++++- doc/API.ja | 20 +++- doc/UNICODE_PROPERTIES | 2 +- harnesses/ascii_compatible.dict | 111 ++++++++++++++++++ harnesses/deluxe-encode-harness.c | 239 ++++++++++++++++++++++++++++++++++++++ harnesses/dict_conv.py | 72 ++++++++++++ harnesses/encode-harness.c | 170 +++++++++++++++++++++++++++ harnesses/syntax-harness.c | 120 +++++++++++++++++++ index.html | 4 +- index_ja.html | 4 +- sample/bug_fix.c | 56 ++------- sample/crnl.c | 2 + sample/encode.c | 142 ++++------------------ sample/listcap.c | 2 + sample/names.c | 3 + sample/posix.c | 5 + sample/scan.c | 2 + sample/simple.c | 3 + sample/sql.c | 4 + sample/syntax.c | 2 + sample/user_property.c | 5 + src/gb18030.c | 6 +- 
src/oniguruma.h | 11 +- src/regcomp.c | 156 +++++++++++++++---------- src/regenc.c | 2 + src/regerror.c | 17 +++ src/regexec.c | 130 ++++++++++++++------- src/regext.c | 6 +- src/regint.h | 6 +- src/regparse.c | 190 +++++++++++++++++------------- src/regparse.h | 22 ++-- src/utf16_be.c | 35 +++++- src/utf16_le.c | 26 ++++- test/test_utf8.c | 13 +++ test/testu.c | 15 --- 43 files changed, 1350 insertions(+), 449 deletions(-) create mode 100755 build_harnesses.sh create mode 100644 harnesses/ascii_compatible.dict create mode 100644 harnesses/deluxe-encode-harness.c create mode 100644 harnesses/dict_conv.py create mode 100644 harnesses/encode-harness.c create mode 100644 harnesses/syntax-harness.c diff --git a/.gitignore b/.gitignore index 6af6a82..227b7df 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ Makefile.in m4/*.m4 /coverage /coverage.info +/fuzzers # src/ /src/CaseFolding.txt @@ -62,3 +63,5 @@ m4/*.m4 /sample/count /sample/bug_fix /sample/log* + +/harnesses/utf16*.dict diff --git a/CMakeLists.txt b/CMakeLists.txt index f3eca6b..c59bfe3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,28 +1,19 @@ cmake_minimum_required(VERSION 3.1) -project(oniguruma VERSION 6.9.2) +project(oniguruma + VERSION 6.9.3 + LANGUAGES C) set(PACKAGE onig) set(PACKAGE_VERSION ${PROJECT_VERSION}) option(BUILD_SHARED_LIBS "Build shared libraries" ON) option(ENABLE_POSIX_API "Include POSIX API" ON) - -set(USE_CRNL_AS_LINE_TERMINATOR 0) -set(VERSION ${PACKAGE_VERSION}) - if(MSVC) - # Force to always compile with W4 - if(CMAKE_CXX_FLAGS MATCHES "/W[0-4]") - string(REGEX REPLACE "/W[0-4]" "/W4" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") - endif() -elseif(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") -elseif(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") + option(MSVC_STATIC_RUNTIME "Build with static runtime" OFF) endif() +set(USE_CRNL_AS_LINE_TERMINATOR 0) +set(VERSION 
${PACKAGE_VERSION}) include(CheckCSourceCompiles) include(CheckIncludeFiles) @@ -73,6 +64,26 @@ target_include_directories(onig PUBLIC $ $) +if(MSVC) + target_compile_options(onig PRIVATE + #/W4 + ) + if(MSVC_STATIC_RUNTIME) + target_compile_options(onig PRIVATE + $<$:/MT> + $<$:/MTd> + $<$:/MT> + $<$:/MTd> + ) + target_compile_definitions(onig PUBLIC -DONIG_STATIC) + endif() +elseif(CMAKE_COMPILER_IS_GNUCC) + target_compile_options(onig PRIVATE + -Wall + ) +endif() + + # Installation (https://github.com/forexample/package-example) # Introduce variables: diff --git a/HISTORY b/HISTORY index 3649e4e..0380cb4 100644 --- a/HISTORY +++ b/HISTORY @@ -1,5 +1,19 @@ History +2019/08/06: Version 6.9.3 (secirity fix release) + +2019/07/30: add ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE +2019/07/29: add STK_PREC_READ_START/END stack type +2019/07/29: Fix #147: Stack Exhaustion Problem caused by some parsing functions +2019/07/11: add a dictionary file for libfuzzer +2019/07/07: add harnesses directory +2019/07/05-2019/07/29: fix many problems found by libfuzzer programs +2019/06/27: deprecate onig_new_deluxe() +2019/06/27: Fix CVE-2019-13224: don't allow different encodings for onig_new_deluxe() +2019/06/27: Fix CVE-2019-13225: problem in converting if-then-else pattern + +2019/05/07: Version 6.9.2 (same as Release Candidate 3) + 2019/04/23: Release Candidate 3 for 6.9.2 2019/04/23: add doc/SYNTAX.md into distribution file 2019/04/09: Release Candidate 2 for 6.9.2 diff --git a/Makefile.am b/Makefile.am index 6045eae..a0bbc7b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,6 +39,12 @@ pkgconfig_DATA = oniguruma.pc all-test: cd test; make test +sanitize: + make clean + ./configure CC=clang CFLAGS="-O -g -fsanitize=address" + make + make all-test + cov: make lcov-clear cd test; make CFLAGS="--coverage" test diff --git a/README.md b/README.md index 873f86d..6a4783b 100644 --- a/README.md +++ b/README.md @@ -27,46 +27,55 @@ Supported character encodings: * doc/SYNTAX.md: 
contributed by seanofw -New feature of version 6.9.2 ------------------------------------ +Version 6.9.3 (security fix release) +------------------------------------ +* Fixed CVE-2019-13224 +* Fixed CVE-2019-13225 +* Fixed many problems (found by libfuzzer programs) + + +Version 6.9.2 (Reiwa) +--------------------- + +* add doc/SYNTAX.md * Update Unicode version 12.1.0 -* NEW: Unicode Text Segment mode option (?y{g}) (?y{w}) +* NEW: Unicode Text Segment mode option (?y{g}) (?y{w}) (*original) g: Extended Grapheme Cluster mode / w: Word mode (Unicode Standard Annex #29 [http://unicode.org/reports/tr29/]) -New feature of version 6.9.1 --------------------------- +Version 6.9.1 +------------- * Speed improvement (* especially UTF-8) -New feature of version 6.9.0 --------------------------- +Version 6.9.0 +------------- * Update Unicode version 11.0.0 * NEW: add Emoji properties -New feature of version 6.8.2 --------------------------- +Version 6.8.2 +------------- * Fix: #80 UChar in header causes issue * NEW API: onig_set_callout_user_data_of_match_param() (* omission in 6.8.0) * add doc/CALLOUTS.API and doc/CALLOUTS.API.ja -New feature of version 6.8.1 --------------------------- +Version 6.8.1 +------------- * Update shared library version to 5.0.0 for API incompatible changes from 6.7.1 -New feature of version 6.8.0 --------------------------- +Version 6.8.0 +------------- * Retry-limit-in-match function enabled by default * NEW: configure option --enable-posix-api=no (* enabled by default) @@ -77,14 +86,14 @@ New feature of version 6.8.0 * Examples of Callouts program: [callout.c](sample/callout.c), [count.c](sample/count.c), [echo.c](sample/echo.c) -New feature of version 6.7.1 --------------------------- +Version 6.7.1 +------------- * NEW: Mechanism of retry-limit-in-match (* disabled by default) -New feature of version 6.7.0 --------------------------- +Version 6.7.0 +------------- * NEW: hexadecimal codepoint \uHHHH * NEW: add ONIG_SYNTAX_ONIGURUMA (== 
ONIG_SYNTAX_DEFAULT) @@ -92,8 +101,8 @@ New feature of version 6.7.0 * Reduced size of object file -New feature of version 6.6.0 --------------------------- +Version 6.6.0 +------------- * NEW: ASCII only mode options for character type/property (?WDSP) * NEW: Extended Grapheme Cluster boundary \y, \Y @@ -101,8 +110,8 @@ New feature of version 6.6.0 * Range-clear (Absent-clear) operator restores previous range in retractions. -New feature of version 6.5.0 --------------------------- +Version 6.5.0 +------------- * NEW: \K (keep) * NEW: \R (general newline) \N (no newline) @@ -114,16 +123,16 @@ New feature of version 6.5.0 * NEW: Absent stopper (?~|absent) (*original) -New feature of version 6.4.0 --------------------------- +Version 6.4.0 +------------- * Fix fatal problem of endless repeat on Windows * NEW: call zero (call the total regexp) \g<0> * NEW: relative backref/call by positive number \k<+n>, \g<+n> -New feature of version 6.3.0 --------------------------- +Version 6.3.0 +------------- * NEW: octal codepoint \o{.....} * Fixed CVE-2017-9224 @@ -134,20 +143,20 @@ New feature of version 6.3.0 * Fixed CVE-2017-9229 -New feature of version 6.1.2 --------------------------- +Version 6.1.2 +------------- * allow word bound, word begin and word end in look-behind. 
* NEW option: ONIG_OPTION_CHECK_VALIDITY_OF_STRING -New feature of version 6.1 --------------------------- +Version 6.1 +----------- * improved doc/RE * NEW API: onig_scan() -New feature of version 6.0 --------------------------- +Version 6.0 +----------- * Update Unicode 8.0 Property/Case-folding * NEW API: onig_unicode_define_user_property() diff --git a/build_harnesses.sh b/build_harnesses.sh new file mode 100755 index 0000000..54dc9ff --- /dev/null +++ b/build_harnesses.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +make clean +autoreconf -vfi + +# build the library with ASAN +#NO_LINK="-fsanitize=fuzzer-no-link" +NO_LINK="" +./configure CC=clang LD=clang CFLAGS="-g -fsanitize=address -fno-omit-frame-pointer $NO_LINK" LDFLAGS="-g -fsanitize=address -fno-omit-frame-pointer $NO_LINK" +make -j4 + +OUT=`pwd`/fuzzers +mkdir -p $OUT +LIBFUZZER_FLAGS="-fsanitize=fuzzer,address -fno-omit-frame-pointer" +#LIBS="src/.libs/libonig.a" +LIBS="src/.libs/libonig.a /usr/local/lib/libLLVMFuzzerMain.a" + +CFLAGS="-Isrc -g $LIBFUZZER_FLAGS" + +# Libfuzzer builds +clang++ contributed/libfuzzer-onig.cpp $LIBS $CFLAGS -o $OUT/libfuzzer-onig +clang harnesses/syntax-harness.c $LIBS $CFLAGS -o $OUT/syntax-libfuzzer +clang harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/encode-libfuzzer +clang harnesses/deluxe-encode-harness.c $LIBS $CFLAGS -o $OUT/deluxe-encode-libfuzzer + +clang -DUTF16_BE harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/utf16-be-libfuzzer +clang -DUTF16_LE harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/utf16-le-libfuzzer +clang -DWITH_READ_MAIN harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-encode +clang -DWITH_READ_MAIN -DUTF16_LE harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-utf16-le +clang -DWITH_READ_MAIN -DUTF16_BE harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-utf16-be +clang -DWITH_READ_MAIN harnesses/deluxe-encode-harness.c $LIBS $CFLAGS -o $OUT/main-deluxe-encode diff --git a/configure.ac b/configure.ac 
index 010a0d8..62c9fa5 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ dnl Process this file with autoconf to produce a configure script. -AC_INIT(onig, 6.9.2) +AC_INIT(onig, 6.9.3) AC_CONFIG_MACRO_DIR([m4]) diff --git a/contributed/libfuzzer-onig.cpp b/contributed/libfuzzer-onig.cpp index e137b73..526c826 100644 --- a/contributed/libfuzzer-onig.cpp +++ b/contributed/libfuzzer-onig.cpp @@ -29,6 +29,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) #ifdef FULL_TEST onig_initialize(&enc, 1); + onig_set_retry_limit_in_match(120); + onig_set_parse_depth_limit(120); #endif if (onig_new(®, Data, Data + Size, ONIG_OPTION_DEFAULT, enc, diff --git a/doc/API b/doc/API index 2309e5e..049db02 100644 --- a/doc/API +++ b/doc/API @@ -1,4 +1,4 @@ -Oniguruma API Version 6.9.2 2019/03/25 +Oniguruma API Version 6.9.3 2019/07/06 #include @@ -168,6 +168,9 @@ Oniguruma API Version 6.9.2 2019/03/25 # int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo) + This function is deprecate, and it does not allow the case where + the encoding of pattern and target is different. + Create a regex object. This function is deluxe version of onig_new(). @@ -299,6 +302,7 @@ Oniguruma API Version 6.9.2 2019/03/25 const UChar* range, OnigRegion* region, OnigOptionType option) Search string and return search result and matching region. + Do not pass invalid byte string in the regex character encoding. normal return: match position offset (i.e. p - str >= 0) not found: ONIG_MISMATCH (< 0) @@ -323,15 +327,19 @@ Oniguruma API Version 6.9.2 2019/03/25 const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp) - arguments - 1-7: same as onig_search() - 8 mp: match parameter values (match_stack_limit, retry_limit_in_match) + Search string and return search result and matching region. + Do not pass invalid byte string in the regex character encoding. 
+ + arguments + 1-7: same as onig_search() + 8 mp: match parameter values (match_stack_limit, retry_limit_in_match) # int onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, OnigRegion* region, OnigOptionType option) Match string and return result and matching region. + Do not pass invalid byte string in the regex character encoding. normal return: match length (>= 0) not match: ONIG_MISMATCH ( < 0) @@ -353,6 +361,9 @@ Oniguruma API Version 6.9.2 2019/03/25 const UChar* at, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp) + Match string and return result and matching region. + Do not pass invalid byte string in the regex character encoding. + arguments 1-6: same as onig_match() 7 mp: match parameter values (match_stack_limit, retry_limit_in_match) @@ -364,6 +375,7 @@ Oniguruma API Version 6.9.2 2019/03/25 void* callback_arg) Scan string and callback with matching region. + Do not pass invalid byte string in the regex character encoding. normal return: number of matching times error: error code @@ -611,14 +623,20 @@ Oniguruma API Version 6.9.2 2019/03/25 # int onigenc_strlen(OnigEncoding enc, const UChar* s, const UChar* end) + + Return number of characters in the string. + + # int onigenc_strlen_null(OnigEncoding enc, const UChar* s) Return number of characters in the string. + Do not pass invalid byte string in the character encoding. # int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) Return number of bytes in the string. + Do not pass invalid byte string in the character encoding. 
# int onig_set_default_syntax(OnigSyntaxType* syntax) diff --git a/doc/API.ja b/doc/API.ja index 164d0b8..5871558 100644 --- a/doc/API.ja +++ b/doc/API.ja @@ -1,4 +1,4 @@ -鬼車インターフェース Version 6.9.2 2019/03/29 +鬼車インターフェース Version 6.9.3 2019/07/06 #include @@ -167,6 +167,9 @@ # int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo) + この関数は廃止予定。 + パターンと対象文字列の文字エンコーディングが異なる場合を許さなくなった。 + 正規表現オブジェクト(regex)を作成する。 この関数は、onig_new()のデラックス版。 @@ -298,6 +301,7 @@ const UChar* range, OnigRegion* region, OnigOptionType option) 正規表現で文字列を検索し、検索結果とマッチ領域を返す。 + 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 正常終了戻り値: マッチ位置 (p - str >= 0) 検索失敗: ONIG_MISMATCH (< 0) @@ -322,6 +326,9 @@ const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp) + 正規表現で文字列を検索し、検索結果とマッチ領域を返す。 + 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 + 引数 1-7: onig_search()と同じ 8 mp: マッチパラメタ値 (match_stack_limit, retry_limit_in_match) @@ -331,6 +338,7 @@ const UChar* at, OnigRegion* region, OnigOptionType option) 文字列の指定位置でマッチングを行い、結果とマッチ領域を返す。 + 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 正常終了戻り値: マッチしたバイト長 (>= 0) not match: ONIG_MISMATCH ( < 0) @@ -352,6 +360,9 @@ const UChar* at, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp) + 文字列の指定位置でマッチングを行い、結果とマッチ領域を返す。 + 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 + 引数 1-6: onig_match()と同じ 7 mp: マッチパラメタ値 (match_stack_limit, retry_limit_in_match) @@ -363,6 +374,7 @@ void* callback_arg) 正規表現で文字列をスキャンして、マッチングする毎にコールバック関数を呼び出す。 + 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 正常終了: マッチ回数 (0回も含める) エラー: エラーコード (< 0) @@ -616,14 +628,20 @@ # int onigenc_strlen(OnigEncoding enc, const UChar* s, const UChar* end) + + 文字列の文字数を返す。 + + # int onigenc_strlen_null(OnigEncoding enc, const UChar* s) 文字列の文字数を返す。 + 文字エンコーディングに対して、不正な文字列を渡してはいけない。 # int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) 文字列のバイト数を返す。 + 文字エンコーディングに対して、不正な文字列を渡してはいけない。 # 
int onig_set_default_syntax(OnigSyntaxType* syntax) diff --git a/doc/UNICODE_PROPERTIES b/doc/UNICODE_PROPERTIES index 1148b4d..ff2a6ce 100644 --- a/doc/UNICODE_PROPERTIES +++ b/doc/UNICODE_PROPERTIES @@ -1,4 +1,4 @@ -Unicode Properties (from Unicode Version: 12.0.0) +Unicode Properties (from Unicode Version: 12.1.0) 15: ASCII_Hex_Digit 16: Adlam diff --git a/harnesses/ascii_compatible.dict b/harnesses/ascii_compatible.dict new file mode 100644 index 0000000..820bf47 --- /dev/null +++ b/harnesses/ascii_compatible.dict @@ -0,0 +1,111 @@ +# First-pass fuzzing dictionary for Oniguruma by Mark Griffin +"\\o{17777777777}" +"\\777" +"\\u" +"\\uFFFF" +"\\xFF" +"\\x{70000000}" +"\\C-" +"\\M-\\C-" +"\\X" +"\\p{" +"\\p{^" +"}" +"]" +")" +"\\n" +"\\r" +"\\R" +"\\W" +"\\w" +"\\s" +"\\S" +"\\d" +"\\O" +"\\X" +"\\b" +"\\y" +"\\Y" +"\\A" +"\\z" +"\\K" +"\\G" +"\\p{Print}" +"\\p{ASCII}" +"\\p{Alnum}" +"{0,2}" +"{3,}" +"{,3}" +"{5}" +"{4,2}" +"??" +"*?" +"+?" +"*+" +"{1,3}+" +"(?>" +"\\B" +"(?y{" +"[abcd1-9]" +"[\\w\\d" +"[\\p{Alphabetic}" +"[\\P{Arabic}" +"[\\x{ffff}" +"[a-w&&" +"[^" +"[:graph:]" +"[^:cntrl:]" +"(?i:" +"(?i)" +"(?m:" +"(?x:" +"(?W:" +"(?y-:" +"(?y{w}:" +"(?P:" +"(?#" +"(?:" +"(?=" +"(?!" +"(?<=" +"(?" +"(?" +"(?{" +"(?{....}[x])" +"(?{.}[x]>)" +"(?{{{.}}})" +"(?~" +"(?~a)" +"(?~|a|.*)" +"(?~|(?:a|b))" +"(?~|)" +"(?(.) 
|.)" +"(?('-n'))" +"(?(n+0))" +"(?(n+1))" +"(?(n-1))" +"(?())" +"(?())" +"(?())" +"(*ERROR{-2000})" +"(*COUNT[tag]{X})" +"\\1" +"\\2" +"\\k" +"\\k<1>" +"\\k<2>" +"\\k<-1>" +"\\k<-2>" +"\\k" +"\\k" +"\\k" +"\\g<-1>" +"\\g" +"name" +"(?a|b\\gc)" +"(?-i:\\g)" +"\\N{name}" +"\\p{Hiragana}" +"\\p{Katakana}" +"\\p{Emoji}" diff --git a/harnesses/deluxe-encode-harness.c b/harnesses/deluxe-encode-harness.c new file mode 100644 index 0000000..e1f84a5 --- /dev/null +++ b/harnesses/deluxe-encode-harness.c @@ -0,0 +1,239 @@ +/* + * deluxe-encode-harness.c + * contributed by Mark Griffin + */ +#include +#include "oniguruma.h" + +#include +#include + +#define DEFAULT_LIMIT 120 +typedef unsigned char uint8_t; + +static int +search(regex_t* reg, unsigned char* str, unsigned char* end) +{ + int r; + unsigned char *start, *range; + OnigRegion *region; + + region = onig_region_new(); + + start = str; + range = end; + r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + if (r >= 0) { + int i; + + fprintf(stdout, "match at %d (%s)\n", r, + ONIGENC_NAME(onig_get_encoding(reg))); + for (i = 0; i < region->num_regs; i++) { + fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + } + else if (r == ONIG_MISMATCH) { + fprintf(stdout, "search fail (%s)\n", + ONIGENC_NAME(onig_get_encoding(reg))); + } + else { /* error */ + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); + fprintf(stdout, "ERROR: %s\n", s); + fprintf(stdout, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + return -1; + } + + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + return 0; +} + +static int +exec(OnigEncoding enc, OnigOptionType options, + char* apattern, char* apattern_end, char* astr, char* astr_end) +{ + int r; + regex_t* reg; + OnigErrorInfo einfo; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + UChar* pattern_end = (UChar* 
)apattern_end; + unsigned char *end = (unsigned char* )astr_end; + + onig_initialize(&enc, 1); + onig_set_retry_limit_in_match(DEFAULT_LIMIT); + onig_set_parse_depth_limit(DEFAULT_LIMIT); + + r = onig_new(®, pattern, pattern_end, + options, enc, ONIG_SYNTAX_DEFAULT, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: %s\n", s); + onig_end(); + return -1; + } + + r = search(reg, str, end); + + onig_free(reg); + onig_end(); + return 0; +} + +static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; + +static int +exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, + OnigOptionType options, char* apattern, char* apattern_end, + char* astr, char* astr_end) +{ + int r; + regex_t* reg; + OnigCompileInfo ci; + OnigErrorInfo einfo; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + UChar* pattern_end = (UChar* )apattern_end; + unsigned char* end = (unsigned char* )astr_end; + + onig_initialize(&str_enc, 1); + onig_set_retry_limit_in_match(DEFAULT_LIMIT); + onig_set_parse_depth_limit(DEFAULT_LIMIT); + + ci.num_of_elements = 5; + ci.pattern_enc = pattern_enc; + ci.target_enc = str_enc; + ci.syntax = ONIG_SYNTAX_DEFAULT; + ci.option = options; + ci.case_fold_flag = CF; + + r = onig_new_deluxe(®, pattern, pattern_end, &ci, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: %s\n", s); + onig_end(); + return -1; + } + + if (onigenc_is_valid_mbc_string(str_enc, str, end) != 0) { + r = search(reg, str, end); + } + + onig_free(reg); + onig_end(); + return 0; +} + +#define PATTERN_SIZE 48 +#define NUM_CONTROL_BYTES 1 +#define MIN_STR_SIZE 2 +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ + int r; + size_t remaining_size; + unsigned char *data; + unsigned char pat_encoding_choice; + unsigned char str_encoding_choice; + unsigned char *pattern; + unsigned 
char *str; + unsigned char *pattern_end; + unsigned char *str_end; + unsigned int num_encodings; + OnigEncodingType *pattern_enc; + OnigEncodingType *str_enc; + + OnigEncodingType *encodings[] = { + ONIG_ENCODING_ASCII, + ONIG_ENCODING_ISO_8859_1, + ONIG_ENCODING_ISO_8859_2, + ONIG_ENCODING_ISO_8859_3, + ONIG_ENCODING_ISO_8859_4, + ONIG_ENCODING_ISO_8859_5, + ONIG_ENCODING_ISO_8859_6, + ONIG_ENCODING_ISO_8859_7, + ONIG_ENCODING_ISO_8859_8, + ONIG_ENCODING_ISO_8859_9, + ONIG_ENCODING_ISO_8859_10, + ONIG_ENCODING_ISO_8859_11, + ONIG_ENCODING_ISO_8859_13, + ONIG_ENCODING_ISO_8859_14, + ONIG_ENCODING_ISO_8859_15, + ONIG_ENCODING_ISO_8859_16, + ONIG_ENCODING_UTF8, + ONIG_ENCODING_UTF16_BE, + ONIG_ENCODING_UTF16_LE, + ONIG_ENCODING_UTF32_BE, + ONIG_ENCODING_UTF32_LE, + ONIG_ENCODING_EUC_JP, + ONIG_ENCODING_EUC_TW, + ONIG_ENCODING_EUC_KR, + ONIG_ENCODING_EUC_CN, + ONIG_ENCODING_SJIS, + //ONIG_ENCODING_KOI8, + ONIG_ENCODING_KOI8_R, + ONIG_ENCODING_CP1251, + ONIG_ENCODING_BIG5, + ONIG_ENCODING_GB18030, + }; + + if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) + return 0; + if (Size > 0x1000) + return 0; + + remaining_size = Size; + data = (unsigned char *)(Data); + + // pull off bytes to switch off + pat_encoding_choice = data[0]; + data++; + remaining_size--; + str_encoding_choice = data[0]; + data++; + remaining_size--; + + // copy first PATTERN_SIZE bytes off to be the pattern + pattern = (unsigned char *)malloc(PATTERN_SIZE+4); + memset(pattern, 0, PATTERN_SIZE+4); + memcpy(pattern, data, PATTERN_SIZE); + pattern_end = pattern + PATTERN_SIZE; + data += PATTERN_SIZE; + remaining_size -= PATTERN_SIZE; + + str = (unsigned char*)malloc(remaining_size+4); + memset(str, 0, remaining_size+4); + memcpy(str, data, remaining_size); + str_end = str + remaining_size; + + num_encodings = sizeof(encodings) / sizeof(encodings[0]); + pattern_enc = encodings[pat_encoding_choice % num_encodings]; + str_enc = encodings[str_encoding_choice % num_encodings]; + + r = 
exec_deluxe(pattern_enc, str_enc, ONIG_OPTION_NONE, (char *)pattern, (char *)pattern_end, (char *)str, (char *)str_end); + + free(pattern); + free(str); + + return r; +} + + +#ifdef WITH_READ_MAIN + +#include + +extern int main(int argc, char* argv[]) +{ + size_t n; + uint8_t Data[10000]; + + n = read(0, Data, sizeof(Data)); + fprintf(stdout, "n: %ld\n", n); + LLVMFuzzerTestOneInput(Data, n); + + return 0; +} +#endif /* WITH_READ_MAIN */ diff --git a/harnesses/dict_conv.py b/harnesses/dict_conv.py new file mode 100644 index 0000000..f721293 --- /dev/null +++ b/harnesses/dict_conv.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# dict_conv.py (Python3 script) + +import sys + +ENC_UTF16_BE = 1 +ENC_UTF16_LE = 2 + +def add_char(enc, s, c): + if enc == ENC_UTF16_BE: + s += "\\x00" + + s += c + if enc == ENC_UTF16_LE: + s += "\\x00" + + return s + +def conv(enc, s): + n = len(s) + r = "" + i = 0 + while i < n: + c = s[i] + if c == '\\': + c = s[i+1] + if c == '\\' or c == '"': + r = add_char(enc, r, "\\" + c) + i += 2 + continue + else: + raise("Unknown escape {0}".format(s)) + + r = add_char(enc, r, c) + i += 1 + + return r + +def main(enc): + print("# This file was generated by dict_conv.py.") + for line in sys.stdin: + s = line.strip() + if s[0] == '#': + print(s) + continue + + if s[0] == '"' and s[-1] == '"': + s = conv(enc, s[1:-1]) + print("\"{0}\"".format(s)) + else: + raise("Invalid format {0}".format(s)) + +def usage(argv): + raise RuntimeError("Usage: python {0} utf16_be/utf16_le".format(argv[0])) + + +if __name__ == "__main__": + argv = sys.argv + argc = len(argv) + + if argc >= 2: + s = argv[1] + if s == 'utf16_be': + enc = ENC_UTF16_BE + elif s == 'utf16_le': + enc = ENC_UTF16_LE + else: + usage(argv) + else: + usage(argv) + + main(enc) diff --git a/harnesses/encode-harness.c b/harnesses/encode-harness.c new file mode 100644 index 0000000..e57fd4f --- /dev/null +++ b/harnesses/encode-harness.c @@ -0,0 +1,170 @@ +/* + * encode-harness.c + * contributed by 
Mark Griffin + */ +#include +#include "oniguruma.h" + +#include +#include + +#define PARSE_DEPTH_LIMIT 120 +#define RETRY_LIMIT 4000 + +typedef unsigned char uint8_t; + +static int +search(regex_t* reg, unsigned char* str, unsigned char* end) +{ + int r; + unsigned char *start, *range; + OnigRegion *region; + + region = onig_region_new(); + + start = str; + range = end; + r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + if (r >= 0) { + int i; + + fprintf(stdout, "match at %d (%s)\n", r, + ONIGENC_NAME(onig_get_encoding(reg))); + for (i = 0; i < region->num_regs; i++) { + fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + } + else if (r == ONIG_MISMATCH) { + fprintf(stdout, "search fail (%s)\n", + ONIGENC_NAME(onig_get_encoding(reg))); + } + else { /* error */ + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); + fprintf(stdout, "ERROR: %s\n", s); + fprintf(stdout, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + return -1; + } + + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + return 0; +} + +static int +exec(OnigEncoding enc, OnigOptionType options, + char* apattern, char* apattern_end, char* astr, UChar* end) +{ + int r; + regex_t* reg; + OnigErrorInfo einfo; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + UChar* pattern_end = (UChar* )apattern_end; + + onig_initialize(&enc, 1); + onig_set_retry_limit_in_match(RETRY_LIMIT); + onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT); + + r = onig_new(®, pattern, pattern_end, + options, enc, ONIG_SYNTAX_DEFAULT, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: %s\n", s); + onig_end(); + return -1; + } + + if (onigenc_is_valid_mbc_string(enc, str, end) != 0) { + r = search(reg, str, end); + } + + onig_free(reg); + onig_end(); + return 0; 
+} + +#define PATTERN_SIZE 32 +#define NUM_CONTROL_BYTES 1 +#define MIN_STR_SIZE 1 +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ + if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) + return 0; + if (Size > 0x1000) + return 0; + + unsigned char *pattern_end; + unsigned char *str_null_end; + + size_t remaining_size = Size; + unsigned char *data = (unsigned char *)(Data); + + // pull off one byte to switch off + unsigned char encoding_choice = data[0]; + data++; + remaining_size--; + + // copy first PATTERN_SIZE bytes off to be the pattern + unsigned char *pattern = (unsigned char *)malloc(PATTERN_SIZE+4); + memset(pattern, 0, PATTERN_SIZE+4); + memcpy(pattern, data, PATTERN_SIZE); + pattern_end = pattern + PATTERN_SIZE; + data += PATTERN_SIZE; + remaining_size -= PATTERN_SIZE; + + unsigned char *str = (unsigned char*)malloc(remaining_size+4); + memset(str, 0, remaining_size+4); + memcpy(str, data, remaining_size); + str_null_end = str + remaining_size; + + int r; + OnigEncodingType *encodings[] = { + ONIG_ENCODING_SJIS, + ONIG_ENCODING_EUC_JP, + ONIG_ENCODING_CP1251, + ONIG_ENCODING_ISO_8859_1, + ONIG_ENCODING_UTF8, + ONIG_ENCODING_KOI8_R, + ONIG_ENCODING_BIG5 + }; + + OnigEncodingType *enc; + +#ifdef UTF16_BE + enc = ONIG_ENCODING_UTF16_BE; +#else +#ifdef UTF16_LE + enc = ONIG_ENCODING_UTF16_LE; +#else + int num_encodings = sizeof(encodings)/sizeof(encodings[0]); + enc = encodings[encoding_choice % num_encodings]; +#endif +#endif + + r = exec(enc, ONIG_OPTION_NONE, (char *)pattern, (char *)pattern_end, + (char *)str, str_null_end); + + free(pattern); + free(str); + + return r; +} + +#ifdef WITH_READ_MAIN + +#include + +extern int main(int argc, char* argv[]) +{ + size_t n; + uint8_t Data[10000]; + + n = read(0, Data, sizeof(Data)); + fprintf(stdout, "n: %ld\n", n); + LLVMFuzzerTestOneInput(Data, n); + + return 0; +} +#endif /* WITH_READ_MAIN */ diff --git a/harnesses/syntax-harness.c b/harnesses/syntax-harness.c new file mode 100644 
index 0000000..0fb3587 --- /dev/null +++ b/harnesses/syntax-harness.c @@ -0,0 +1,120 @@ +/* + * syntax-harness.c + * contributed by Mark Griffin + */ +#include +#include +#include "oniguruma.h" + +#include + +#define DEFAULT_LIMIT 120 +typedef unsigned char uint8_t; + +extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr) +{ + int r; + unsigned char *start, *range, *end; + regex_t* reg; + OnigErrorInfo einfo; + OnigRegion *region; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + + r = onig_new(®, pattern, pattern + strlen((char* )pattern), + ONIG_OPTION_DEFAULT, ONIG_ENCODING_ASCII, syntax, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: %s\n", s); + return -1; + } + + region = onig_region_new(); + + end = str + strlen((char* )str); + start = str; + range = end; + r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + if (r >= 0) { + int i; + + fprintf(stdout, "match at %d\n", r); + for (i = 0; i < region->num_regs; i++) { + fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + } + else if (r == ONIG_MISMATCH) { + fprintf(stdout, "search fail\n"); + } + else { /* error */ + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); + fprintf(stdout, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + return -1; + } + + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + return 0; +} + +#define PATTERN_SIZE 64 +#define NUM_CONTROL_BYTES 1 +#define MIN_STR_SIZE 1 +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ + if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) + return 0; + if (Size > 0x1000) + return 0; + size_t remaining_size = Size; + unsigned char *data = (unsigned char *)(Data); + + // pull off one byte to switch syntax choice + unsigned char 
syntax_choice = data[0]; + data++; + remaining_size--; + + // copy first PATTERN_SIZE bytes off to be the pattern + unsigned char *pattern = (unsigned char *)malloc(PATTERN_SIZE+1); + memset(pattern, 0, PATTERN_SIZE+1); + memcpy(pattern, data, PATTERN_SIZE); + data += PATTERN_SIZE; + remaining_size -= PATTERN_SIZE; + + unsigned char *str = (unsigned char*)malloc(remaining_size+1); + memset(str, 0, remaining_size+1); + memcpy(str, data, remaining_size); + + OnigEncoding use_encs[] = { ONIG_ENCODING_ASCII }; + onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); + + onig_set_retry_limit_in_match(DEFAULT_LIMIT); + onig_set_parse_depth_limit(DEFAULT_LIMIT); + + OnigSyntaxType *syntaxes[] = { + ONIG_SYNTAX_POSIX_EXTENDED, + ONIG_SYNTAX_EMACS, + ONIG_SYNTAX_GREP, + ONIG_SYNTAX_GNU_REGEX, + ONIG_SYNTAX_JAVA, + ONIG_SYNTAX_PERL_NG, + ONIG_SYNTAX_RUBY, + ONIG_SYNTAX_ONIGURUMA, + }; + OnigSyntaxType *syntax = syntaxes[syntax_choice % 8]; + + int r; + r = exec(syntax, (char *)pattern, (char *)str); + // r = exec(ONIG_SYNTAX_JAVA, "\\p{XDigit}\\P{XDigit}[a-c&&b-g]", "bgc"); + + onig_end(); + + free(pattern); + free(str); + + return 0; +} diff --git a/index.html b/index.html index 5ad8231..58ba66d 100644 --- a/index.html +++ b/index.html @@ -8,7 +8,7 @@

Oniguruma

(Japanese)

-(c) K.Kosako, updated at: 2018/12/06 +(c) K.Kosako, updated at: 2019/08/05

@@ -16,6 +16,8 @@
What's new
    +
  • 2019/08/06: Version 6.9.3 released.
  • +
  • 2019/05/07: Version 6.9.2 released.
  • 2018/12/11: Version 6.9.1 released.
  • 2018/09/03: Version 6.9.0 released.
  • 2018/04/17: Version 6.8.2 released.
  • diff --git a/index_ja.html b/index_ja.html index 0ada788..6b75c6c 100644 --- a/index_ja.html +++ b/index_ja.html @@ -8,7 +8,7 @@

    鬼車

    -(c) K.Kosako, 最終更新: 2018/12/06 +(c) K.Kosako, 最終更新: 2019/08/05

    @@ -16,6 +16,8 @@
    更新情報
      +
    • 2019/08/06: Version 6.9.3 リリース
    • +
    • 2019/05/07: Version 6.9.2 リリース
    • 2018/12/11: Version 6.9.1 リリース
    • 2018/09/03: Version 6.9.0 リリース
    • 2018/04/17: Version 6.8.2 リリース
    • diff --git a/sample/bug_fix.c b/sample/bug_fix.c index 81c2784..3f60c5b 100644 --- a/sample/bug_fix.c +++ b/sample/bug_fix.c @@ -4,8 +4,6 @@ #include #include "oniguruma.h" -static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; - static int search(regex_t* reg, unsigned char* str, unsigned char* end) { @@ -36,6 +34,7 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); fprintf(stderr, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); return -1; } @@ -43,45 +42,6 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) return 0; } -static int -exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, - OnigOptionType options, char* apattern, char* astr) -{ - int r; - unsigned char *end; - regex_t* reg; - OnigCompileInfo ci; - OnigErrorInfo einfo; - UChar* pattern = (UChar* )apattern; - UChar* str = (UChar* )astr; - - onig_initialize(&str_enc, 1); - - ci.num_of_elements = 5; - ci.pattern_enc = pattern_enc; - ci.target_enc = str_enc; - ci.syntax = ONIG_SYNTAX_DEFAULT; - ci.option = options; - ci.case_fold_flag = CF; - - r = onig_new_deluxe(®, pattern, - pattern + onigenc_str_bytelen_null(pattern_enc, pattern), - &ci, &einfo); - if (r != ONIG_NORMAL) { - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str((UChar* )s, r, &einfo); - fprintf(stderr, "ERROR: %s\n", s); - return -1; - } - - end = str + onigenc_str_bytelen_null(str_enc, str); - r = search(reg, str, end); - - onig_free(reg); - onig_end(); - return 0; -} - static int exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) { @@ -92,8 +52,6 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) UChar* pattern = (UChar* )apattern; UChar* str = (UChar* )astr; - onig_initialize(&enc, 1); - r = onig_new(®, pattern, pattern + onigenc_str_bytelen_null(enc, pattern), options, enc, ONIG_SYNTAX_DEFAULT, 
&einfo); @@ -108,7 +66,6 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) r = search(reg, str, end); onig_free(reg); - onig_end(); return 0; } @@ -116,16 +73,21 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) extern int main(int argc, char* argv[]) { + OnigEncoding use_encs[1]; + + use_encs[0] = ONIG_ENCODING_UTF8; + onig_initialize(use_encs, 1); + /* fix ignore case in look-behind commit: 3340ec2cc5627172665303fe248c9793354d2251 */ - exec_deluxe(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, - ONIG_OPTION_IGNORECASE, - "(?<=\305\211)a", "\312\274na"); /* \u{0149}a \u{02bc}na */ + exec(ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE, + "(?<=\305\211)a", "\312\274na"); /* \u{0149}a \u{02bc}na */ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE, "(\\2)(\\1)", "aa"); /* fail. */ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_FIND_LONGEST, "a*", "aa aaa aaaa aaaaa "); /* match 12-17 */ + onig_end(); return 0; } diff --git a/sample/crnl.c b/sample/crnl.c index 3ad1210..bfa563e 100644 --- a/sample/crnl.c +++ b/sample/crnl.c @@ -65,6 +65,8 @@ x(int no, char* pattern_arg, char* str_arg, char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str(s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); return -1; } diff --git a/sample/encode.c b/sample/encode.c index 8a03ab8..c5d4771 100644 --- a/sample/encode.c +++ b/sample/encode.c @@ -34,6 +34,7 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); fprintf(stderr, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); return -1; } @@ -72,55 +73,6 @@ exec(OnigEncoding enc, OnigOptionType options, return 0; } -static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; - -#if 0 -static void -set_case_fold(OnigCaseFoldType cf) -{ - CF = cf; -} -#endif - -static int 
-exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, - OnigOptionType options, char* apattern, char* astr) -{ - int r; - unsigned char *end; - regex_t* reg; - OnigCompileInfo ci; - OnigErrorInfo einfo; - UChar* pattern = (UChar* )apattern; - UChar* str = (UChar* )astr; - - onig_initialize(&str_enc, 1); - - ci.num_of_elements = 5; - ci.pattern_enc = pattern_enc; - ci.target_enc = str_enc; - ci.syntax = ONIG_SYNTAX_DEFAULT; - ci.option = options; - ci.case_fold_flag = CF; - - r = onig_new_deluxe(®, pattern, - pattern + onigenc_str_bytelen_null(pattern_enc, pattern), - &ci, &einfo); - if (r != ONIG_NORMAL) { - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str((UChar* )s, r, &einfo); - fprintf(stderr, "ERROR: %s\n", s); - return -1; - } - - end = str + onigenc_str_bytelen_null(str_enc, str); - r = search(reg, str, end); - - onig_free(reg); - onig_end(); - return 0; -} - extern int main(int argc, char* argv[]) { int r; @@ -196,39 +148,6 @@ extern int main(int argc, char* argv[]) r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE, "is", "iss"); - r = exec_deluxe(ONIG_ENCODING_ASCII, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_NONE, "a+", - "\000b\000a\000a\000a\000c\000c\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ASCII, ONIG_ENCODING_UTF16_LE, - ONIG_OPTION_NONE, "a+", - "b\000a\000a\000a\000a\000c\000\000\000"); - - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_LE, - ONIG_OPTION_NONE, - "\000b\000a\000a\000a\000c\000c\000\000", - "x\000b\000a\000a\000a\000c\000c\000\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\337", "\000S\000S\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "SS", "\000\337\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_LE, - ONIG_OPTION_IGNORECASE, - "\337", "S\000S\000\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF32_BE, - 
ONIG_OPTION_IGNORECASE, - "SS", "\000\000\000\337\000\000\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF32_LE, - ONIG_OPTION_IGNORECASE, - "\337", "S\000\000\000S\000\000\000\000\000\000\000"); - r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_NONE, "\000[\000[\000:\000a\000l\000n\000u\000m\000:\000]\000]\000+\000\000", "\000#\002\120\000a\000Z\012\077\012\076\012\075\000\000"); @@ -242,44 +161,34 @@ extern int main(int argc, char* argv[]) r = exec(ONIG_ENCODING_GB18030, ONIG_OPTION_IGNORECASE, "(Aa\\d)+", "BaA5Aa0234"); - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_NONE, - "^\\P{Hiragana}\\p{^Hiragana}(\\p{Hiragana}+)$", - "\060\100\060\240\060\101\060\102\060\226\060\237\000\000"); - - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000[\000\337\000]\000\000", "\000S\000S\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000[\000\337\000]\000\000", "\000S\000S\000\000"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000[\000\337\000]\000\000", "\000s\000S\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000[\000\337\000]\000\000", "\000s\000S\000\000"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000^\000[\000\001\000-\377\375\000]\000$\000\000", - "\000s\000S\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000^\000[\000\001\000-\377\375\000]\000$\000\000", + "\000s\000S\000\000"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000S\000S\000\000", - "\000S\000T\000\337\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000S\000S\000\000", + "\000S\000T\000\337\000\000"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000S\000T\000S\000S\000\000", - 
"\000S\000t\000s\000S\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000S\000T\000S\000S\000\000", + "\000S\000t\000s\000S\000\000"); { UChar pat[] = { 0x1f, 0xfc, 0x00, 0x00 }; UChar str1[] = { 0x21, 0x26, 0x1f, 0xbe, 0x00, 0x00 }; UChar str2[] = { 0x1f, 0xf3, 0x00, 0x00 }; - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, (char* )pat, (char* )str1); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + (char* )pat, (char* )str1); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, (char* )pat, (char* )str2); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + (char* )pat, (char* )str2); } #if 0 @@ -287,17 +196,14 @@ extern int main(int argc, char* argv[]) set_case_fold(ONIGENC_CASE_FOLD_TURKISH_AZERI); - r = exec_deluxe(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, - ONIG_OPTION_IGNORECASE, - "Ii", "\304\261\304\260"); + r = exec(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE, + "Ii", "\304\261\304\260"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000I\000i\000\000", "\001\061\001\060\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000I\000i\000\000", "\001\061\001\060\000\000"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\001\061\001\060\000\000", "\000I\000i\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\001\061\001\060\000\000", "\000I\000i\000\000"); set_case_fold(ONIGENC_CASE_FOLD_MIN); #endif diff --git a/sample/listcap.c b/sample/listcap.c index e0fe23a..a73f7d4 100644 --- a/sample/listcap.c +++ b/sample/listcap.c @@ -69,6 +69,8 @@ extern int ex(unsigned char* str, unsigned char* pattern, else { /* error */ char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); 
return -1; } diff --git a/sample/names.c b/sample/names.c index a838056..9b1eb24 100644 --- a/sample/names.c +++ b/sample/names.c @@ -65,6 +65,9 @@ extern int main(int argc, char* argv[]) else { /* error */ char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + onig_end(); return -1; } diff --git a/sample/posix.c b/sample/posix.c index 35ccb68..c555936 100644 --- a/sample/posix.c +++ b/sample/posix.c @@ -49,6 +49,7 @@ extern int main(int argc, char* argv[]) regerror(r, ®, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); regfree(®); + onig_end(); return -1; } x(®, pattern, (UChar* )"aaabbbbd"); @@ -60,6 +61,7 @@ extern int main(int argc, char* argv[]) regerror(r, ®, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); regfree(®); + onig_end(); return -1; } x(®, pattern, (UChar* )"a+b{2,7}d?|uuu"); @@ -71,6 +73,7 @@ extern int main(int argc, char* argv[]) regerror(r, ®, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); regfree(®); + onig_end(); return -1; } x(®, pattern, (UChar* )"aaaabbbbbbd"); @@ -83,6 +86,7 @@ extern int main(int argc, char* argv[]) regerror(r, ®, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); regfree(®); + onig_end(); return -1; } x(®, pattern, (UChar* )"aaabbbbd)"); @@ -93,6 +97,7 @@ extern int main(int argc, char* argv[]) regerror(r, ®, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); regfree(®); + onig_end(); return -1; } x(®, pattern, (UChar* )"a\nb\n"); diff --git a/sample/scan.c b/sample/scan.c index ad5ae74..4039e46 100644 --- a/sample/scan.c +++ b/sample/scan.c @@ -36,6 +36,7 @@ scan(regex_t* reg, unsigned char* str, unsigned char* end) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((OnigUChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); return -1; } @@ -63,6 +64,7 @@ exec(OnigEncoding enc, OnigOptionType options, 
char* apattern, char* astr) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((OnigUChar* )s, r, &einfo); fprintf(stderr, "ERROR: %s\n", s); + onig_end(); return -1; } diff --git a/sample/simple.c b/sample/simple.c index 95110b8..5a14042 100644 --- a/sample/simple.c +++ b/sample/simple.c @@ -49,6 +49,9 @@ extern int main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + onig_end(); return -1; } diff --git a/sample/sql.c b/sample/sql.c index 8e95f70..1602ac9 100644 --- a/sample/sql.c +++ b/sample/sql.c @@ -42,6 +42,7 @@ extern int main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r, &einfo); fprintf(stderr, "ERROR: %s\n", s); + onig_end(); return -1; } @@ -66,6 +67,9 @@ extern int main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + onig_end(); return -1; } diff --git a/sample/syntax.c b/sample/syntax.c index e292079..e034608 100644 --- a/sample/syntax.c +++ b/sample/syntax.c @@ -45,6 +45,8 @@ extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); return -1; } diff --git a/sample/user_property.c b/sample/user_property.c index 8b2abd2..d52adc0 100644 --- a/sample/user_property.c +++ b/sample/user_property.c @@ -40,6 +40,7 @@ main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_end(); return -1; } @@ -52,6 +53,7 @@ main(int argc, char* argv[]) char 
s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r, &einfo); fprintf(stderr, "onig_new: ERROR: %s\n", s); + onig_end(); return -1; } @@ -76,6 +78,9 @@ main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + onig_end(); return -1; } diff --git a/src/gb18030.c b/src/gb18030.c index 7654432..8d415b0 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -2,7 +2,7 @@ gb18030.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2005-2018 KUBO Takehiro + * Copyright (c) 2005-2019 KUBO Takehiro * K.Kosako * All rights reserved. * @@ -67,11 +67,11 @@ gb18030_mbc_enc_len(const UChar* p) { if (GB18030_MAP[*p] != CM) return 1; + p++; if (GB18030_MAP[*p] == C4) return 4; - if (GB18030_MAP[*p] == C1) - return 1; /* illegal sequence */ + return 2; } diff --git a/src/oniguruma.h b/src/oniguruma.h index f6aa5ba..90cf2d9 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -36,9 +36,9 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 6 #define ONIGURUMA_VERSION_MINOR 9 -#define ONIGURUMA_VERSION_TEENY 2 +#define ONIGURUMA_VERSION_TEENY 3 -#define ONIGURUMA_VERSION_INT 60902 +#define ONIGURUMA_VERSION_INT 60903 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -52,6 +52,7 @@ extern "C" { # define PV_(args) args #endif +#ifndef ONIG_STATIC #ifndef ONIG_EXTERN #if defined(_WIN32) && !defined(__GNUC__) #if defined(ONIGURUMA_EXPORT) @@ -65,6 +66,9 @@ extern "C" { #ifndef ONIG_EXTERN #define ONIG_EXTERN extern #endif +#else +#define ONIG_EXTERN extern +#endif /* PART: character encoding */ @@ -517,6 +521,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1U<<21) /* [..\w..] etc.. 
*/ #define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1U<<22) #define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1U<<23) /* [0-9-a]=[0-9\-a] */ +#define ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC (1U<<26) /* syntax (behavior) warning */ #define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1U<<24) /* [,-,] */ #define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1U<<25) /* (?:a*)+ */ @@ -766,6 +771,8 @@ int onig_init P_((void)); ONIG_EXTERN int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...)); ONIG_EXTERN +int onig_is_error_code_needs_param PV_((int code)); +ONIG_EXTERN void onig_set_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN void onig_set_verb_warn_func P_((OnigWarnFunc f)); diff --git a/src/regcomp.c b/src/regcomp.c index c2c04a4..b96c793 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -599,12 +599,34 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) } static int -compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env) +is_strict_real_node(Node* node) +{ + switch (NODE_TYPE(node)) { + case NODE_STRING: + { + StrNode* sn = STR_(node); + return (sn->end != sn->s); + } + break; + + case NODE_CCLASS: + case NODE_CTYPE: + return 1; + break; + + default: + return 0; + break; + } +} + +static int +compile_tree_empty_check(Node* node, regex_t* reg, int emptiness, ScanEnv* env) { int r; int saved_num_null_check = reg->num_null_check; - if (empty_info != BODY_IS_NOT_EMPTY) { + if (emptiness != BODY_IS_NOT_EMPTY) { r = add_op(reg, OP_EMPTY_CHECK_START); if (r != 0) return r; COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */ @@ -614,12 +636,12 @@ compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env) r = compile_tree(node, reg, env); if (r != 0) return r; - if (empty_info != BODY_IS_NOT_EMPTY) { - if (empty_info == BODY_IS_EMPTY) + if (emptiness != BODY_IS_NOT_EMPTY) { + if (emptiness == BODY_IS_EMPTY_POSSIBILITY) r = add_op(reg, OP_EMPTY_CHECK_END); - else if (empty_info == BODY_IS_EMPTY_MEM) + else 
if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); - else if (empty_info == BODY_IS_EMPTY_REC) + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); if (r != 0) return r; @@ -895,12 +917,12 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) } p[id].lower = lower; - p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); + p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper); return 0; } static int -compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, +compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, regex_t* reg, ScanEnv* env) { int r; @@ -915,7 +937,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper); if (r != 0) return r; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; if ( @@ -937,7 +959,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, static int is_anychar_infinite_greedy(QuantNode* qn) { - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && + if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) && NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn))) return 1; else @@ -951,8 +973,8 @@ static int compile_length_quantifier_node(QuantNode* qn, regex_t* reg) { int len, mod_tlen; - int infinite = IS_REPEAT_INFINITE(qn->upper); - enum BodyEmpty empty_info = qn->empty_info; + int infinite = IS_INFINITE_REPEAT(qn->upper); + enum BodyEmptyType emptiness = qn->emptiness; int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg); if (tlen < 0) return tlen; @@ -969,10 +991,9 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) } } - if (empty_info == BODY_IS_NOT_EMPTY) - mod_tlen = tlen; - else - mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); + mod_tlen = tlen; 
+ if (emptiness != BODY_IS_NOT_EMPTY) + mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || @@ -1026,8 +1047,8 @@ static int compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) { int i, r, mod_tlen; - int infinite = IS_REPEAT_INFINITE(qn->upper); - enum BodyEmpty empty_info = qn->empty_info; + int infinite = IS_INFINITE_REPEAT(qn->upper); + enum BodyEmptyType emptiness = qn->emptiness; int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg); if (tlen < 0) return tlen; @@ -1055,10 +1076,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } } - if (empty_info == BODY_IS_NOT_EMPTY) - mod_tlen = tlen; - else - mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); + mod_tlen = tlen; + if (emptiness != BODY_IS_NOT_EMPTY) + mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || @@ -1096,7 +1116,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) COP(reg)->push_or_jump_exact1.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; addr = -(mod_tlen + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1); @@ -1109,7 +1129,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) COP(reg)->push_if_peek_next.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; addr = -(mod_tlen + (int )SIZE_OP_PUSH_IF_PEEK_NEXT); @@ -1119,7 +1139,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; COP(reg)->push.addr = SIZE_INC_OP + mod_tlen + 
SIZE_OP_JUMP; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; addr = -(mod_tlen + (int )SIZE_OP_PUSH); @@ -1134,7 +1154,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; COP(reg)->jump.addr = mod_tlen + SIZE_INC_OP; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; r = add_op(reg, OP_PUSH); @@ -1188,7 +1208,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) r = compile_tree(NODE_QUANT_BODY(qn), reg, env); } else { - r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg, env); + r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env); } return r; } @@ -1273,7 +1293,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) break; case BAG_STOP_BACKTRACK: - if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { + if (NODE_IS_STRICT_REAL_REPEAT(node)) { int v; QuantNode* qn; @@ -1307,8 +1327,9 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len += tlen; } + len += SIZE_OP_JUMP + SIZE_OP_ATOMIC_END; + if (IS_NOT_NULL(Else)) { - len += SIZE_OP_JUMP; tlen = compile_length_tree(Else, reg); if (tlen < 0) return tlen; len += tlen; @@ -1423,7 +1444,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) break; case BAG_STOP_BACKTRACK: - if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { + if (NODE_IS_STRICT_REAL_REPEAT(node)) { QuantNode* qn = QUANT_(NODE_BAG_BODY(node)); r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env); if (r != 0) return r; @@ -1455,7 +1476,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) case BAG_IF_ELSE: { - int cond_len, then_len, jump_len; + int cond_len, then_len, else_len, jump_len; Node* cond = NODE_BAG_BODY(node); Node* Then = node->te.Then; Node* Else = node->te.Else; @@ -1472,8 +1493,7 @@ 
compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) else then_len = 0; - jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END; - if (IS_NOT_NULL(Else)) jump_len += SIZE_OP_JUMP; + jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END + SIZE_OP_JUMP; r = add_op(reg, OP_PUSH); if (r != 0) return r; @@ -1490,11 +1510,20 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) } if (IS_NOT_NULL(Else)) { - int else_len = compile_length_tree(Else, reg); - r = add_op(reg, OP_JUMP); - if (r != 0) return r; - COP(reg)->jump.addr = else_len + SIZE_INC_OP; + else_len = compile_length_tree(Else, reg); + if (else_len < 0) return else_len; + } + else + else_len = 0; + + r = add_op(reg, OP_JUMP); + if (r != 0) return r; + COP(reg)->jump.addr = SIZE_OP_ATOMIC_END + else_len + SIZE_INC_OP; + r = add_op(reg, OP_ATOMIC_END); + if (r != 0) return r; + + if (IS_NOT_NULL(Else)) { r = compile_tree(Else, reg, env); } } @@ -3035,7 +3064,7 @@ tree_max_len(Node* node, ScanEnv* env) if (qn->upper != 0) { len = tree_max_len(NODE_BODY(node), env); if (len != 0) { - if (! IS_REPEAT_INFINITE(qn->upper)) + if (! IS_INFINITE_REPEAT(qn->upper)) len = distance_multiply(len, qn->upper); else len = INFINITE_LEN; @@ -3581,7 +3610,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) type = NODE_TYPE(node); if (type == NODE_QUANT) { QuantNode* qn = QUANT_(node); - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { + if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) { #ifdef USE_QUANT_PEEK_NEXT Node* n = get_head_value_node(next_node, 1, reg); /* '\0': for UTF-16BE etc... 
*/ @@ -3591,7 +3620,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) #endif /* automatic posseivation a*b ==> (?>a*)b */ if (qn->lower <= 1) { - if (NODE_IS_SIMPLE_TYPE(NODE_BODY(node))) { + if (is_strict_real_node(NODE_BODY(node))) { Node *x, *y; x = get_head_value_node(NODE_BODY(node), 0, reg); if (IS_NOT_NULL(x)) { @@ -3599,7 +3628,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) { Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK); CHECK_NULL_RETURN_MEMERR(en); - NODE_STATUS_ADD(en, STOP_BT_SIMPLE_REPEAT); + NODE_STATUS_ADD(en, STRICT_REAL_REPEAT); swap_node(node, en); NODE_BODY(node) = en; } @@ -4001,11 +4030,11 @@ expand_case_fold_string(Node* node, regex_t* reg, int state) return r; } -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT -static enum BodyEmpty +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +static enum BodyEmptyType quantifiers_memory_node_info(Node* node) { - int r = BODY_IS_EMPTY; + int r = BODY_IS_EMPTY_POSSIBILITY; switch (NODE_TYPE(node)) { case NODE_LIST: @@ -4022,7 +4051,7 @@ quantifiers_memory_node_info(Node* node) #ifdef USE_CALL case NODE_CALL: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_REC; /* tiny version */ + return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */ } else r = quantifiers_memory_node_info(NODE_BODY(node)); @@ -4044,9 +4073,9 @@ quantifiers_memory_node_info(Node* node) switch (en->type) { case BAG_MEMORY: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_REC; + return BODY_IS_EMPTY_POSSIBILITY_REC; } - return BODY_IS_EMPTY_MEM; + return BODY_IS_EMPTY_POSSIBILITY_MEM; break; case BAG_OPTION: @@ -4083,7 +4112,7 @@ quantifiers_memory_node_info(Node* node) return r; } -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #ifdef USE_CALL @@ -4351,7 +4380,7 @@ setup_called_state_call(Node* node, int state) { QuantNode* qn = QUANT_(node); - if (IS_REPEAT_INFINITE(qn->upper) || 
qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; @@ -4468,7 +4497,7 @@ setup_called_state(Node* node, int state) { QuantNode* qn = QUANT_(node); - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; @@ -4600,24 +4629,24 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) NODE_STATUS_ADD(node, IN_MULTI_ENTRY); } - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) { d = tree_min_len(body, env); if (d == 0) { -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT - qn->empty_info = quantifiers_memory_node_info(body); - if (qn->empty_info == BODY_IS_EMPTY_REC) { +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT + qn->emptiness = quantifiers_memory_node_info(body); + if (qn->emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) { if (NODE_TYPE(body) == NODE_BAG && BAG_(body)->type == BAG_MEMORY) { MEM_STATUS_ON(env->bt_mem_end, BAG_(body)->m.regnum); } } #else - qn->empty_info = BODY_IS_EMPTY; + qn->emptiness = BODY_IS_EMPTY_POSSIBILITY; #endif } } - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; @@ -4628,7 +4657,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) /* expand string */ #define EXPAND_STRING_MAX_LENGTH 100 if (NODE_TYPE(body) == NODE_STRING) { - if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && + if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper && qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { int len = NODE_STRING_LEN(body); StrNode* sn = STR_(body); @@ -4646,7 +4675,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } } - if (qn->greedy && (qn->empty_info == 
BODY_IS_NOT_EMPTY)) { + if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) { if (NODE_TYPE(body) == NODE_QUANT) { QuantNode* tqn = QUANT_(body); if (IS_NOT_NULL(tqn->head_exact)) { @@ -4663,7 +4692,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } /* setup_tree does the following work. - 1. check empty loop. (set qn->empty_info) + 1. check empty loop. (set qn->emptiness) 2. expand ignore-case in char class. 3. set memory status bit flags. (reg->mem_stats) 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. @@ -4752,10 +4781,10 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) r = setup_tree(target, reg, state, env); if (NODE_TYPE(target) == NODE_QUANT) { QuantNode* tqn = QUANT_(target); - if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && + if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 && tqn->greedy != 0) { /* (?>a*), a*+ etc... */ - if (NODE_IS_SIMPLE_TYPE(NODE_BODY(target))) - NODE_STATUS_ADD(node, STOP_BT_SIMPLE_REPEAT); + if (is_strict_real_node(NODE_BODY(target))) + NODE_STATUS_ADD(node, STRICT_REAL_REPEAT); } } } @@ -5752,7 +5781,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) opt->sm.reach_end = 0; } - if (IS_REPEAT_INFINITE(qn->upper)) { + if (IS_INFINITE_REPEAT(qn->upper)) { if (env->mmd.max == 0 && NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) { if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) @@ -6672,6 +6701,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) } else { len = ONIGENC_CODE_TO_MBCLEN(enc, code); + if (len < 0) return 0; } return onig_is_code_in_cc_len(len, code, cc); } diff --git a/src/regenc.c b/src/regenc.c index 6376565..9fab721 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -853,6 +853,8 @@ onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag, extern int onigenc_mb2_code_to_mbclen(OnigCodePoint code) { + if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + if ((code & 
0xff00) != 0) return 2; else return 1; } diff --git a/src/regerror.c b/src/regerror.c index 7564827..e6d1806 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -257,6 +257,23 @@ static int to_ascii(OnigEncoding enc, UChar *s, UChar *end, } +extern int +onig_is_error_code_needs_param(int code) +{ + switch (code) { + case ONIGERR_UNDEFINED_NAME_REFERENCE: + case ONIGERR_UNDEFINED_GROUP_REFERENCE: + case ONIGERR_MULTIPLEX_DEFINED_NAME: + case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL: + case ONIGERR_INVALID_GROUP_NAME: + case ONIGERR_INVALID_CHAR_IN_GROUP_NAME: + case ONIGERR_INVALID_CHAR_PROPERTY_NAME: + return 1; + default: + return 0; + } +} + /* for ONIG_MAX_ERROR_MESSAGE_LEN */ #define MAX_ERROR_PAR_LEN 30 diff --git a/src/regexec.c b/src/regexec.c index 6618996..f957b75 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -980,6 +980,8 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) #define STK_CALL_FRAME 0x0400 #define STK_RETURN 0x0500 #define STK_SAVE_VAL 0x0600 +#define STK_PREC_READ_START 0x0700 +#define STK_PREC_READ_END 0x0800 /* stack type check mask */ #define STK_MASK_POP_USED STK_ALT_FLAG @@ -1544,8 +1546,8 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) #define STACK_PUSH_SUPER_ALT(pat,s,sprev) STACK_PUSH(STK_SUPER_ALT,pat,s,sprev) -#define STACK_PUSH_POS(s,sprev) \ - STACK_PUSH(STK_TO_VOID_START,(Operation* )0,s,sprev) +#define STACK_PUSH_PREC_READ_START(s,sprev) \ + STACK_PUSH(STK_PREC_READ_START,(Operation* )0,s,sprev) #define STACK_PUSH_ALT_PREC_READ_NOT(pat,s,sprev) \ STACK_PUSH(STK_ALT_PREC_READ_NOT,pat,s,sprev) #define STACK_PUSH_TO_VOID_START STACK_PUSH_TYPE(STK_TO_VOID_START) @@ -1887,6 +1889,27 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ } while(0) +#define STACK_GET_PREC_READ_START(k) do {\ + int level = 0;\ + k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k, "STACK_GET_PREC_READ_START");\ + if (IS_TO_VOID_TARGET(k)) {\ + k->type = STK_VOID;\ + }\ 
+ else if (k->type == STK_PREC_READ_START) {\ + if (level == 0) {\ + break;\ + }\ + level--;\ + }\ + else if (k->type == STK_PREC_READ_END) {\ + level++;\ + }\ + }\ +} while(0) + #define STACK_EMPTY_CHECK(isnull,sid,s) do {\ StackType* k = stk;\ while (1) {\ @@ -1913,7 +1936,7 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ } while (0) -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT #define STACK_EMPTY_CHECK_MEM(isnull,sid,s,reg) do {\ StackType* k = stk;\ while (1) {\ @@ -1927,9 +1950,10 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ else {\ UChar* endp;\ + int level = 0;\ (isnull) = 1;\ while (k < stk) {\ - if (k->type == STK_MEM_START) {\ + if (k->type == STK_MEM_START && level == 0) {\ STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ if (endp == 0) {\ (isnull) = 0; break;\ @@ -1941,6 +1965,12 @@ stack_double(int is_alloca, char** arg_alloc_base, (isnull) = -1; /* empty, but position changed */ \ }\ }\ + else if (k->type == STK_PREC_READ_START) {\ + level++;\ + }\ + else if (k->type == STK_PREC_READ_END) {\ + level--;\ + }\ k++;\ }\ break;\ @@ -1965,10 +1995,11 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ else {\ UChar* endp;\ + int prec_level = 0;\ (isnull) = 1;\ while (k < stk) {\ if (k->type == STK_MEM_START) {\ - if (level == 0) {\ + if (level == 0 && prec_level == 0) {\ STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ if (endp == 0) {\ (isnull) = 0; break;\ @@ -1987,6 +2018,12 @@ stack_double(int is_alloca, char** arg_alloc_base, else if (k->type == STK_EMPTY_CHECK_END) {\ if (k->zid == (sid)) level--;\ }\ + else if (k->type == STK_PREC_READ_START) {\ + prec_level++;\ + }\ + else if (k->type == STK_PREC_READ_END) {\ + prec_level--;\ + }\ k++;\ }\ break;\ @@ -2023,7 +2060,7 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ }\ } while(0) -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #define 
STACK_GET_REPEAT(sid, k) do {\ int level = 0;\ @@ -2968,6 +3005,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, NEXT_OUT; CASE_OP(CCLASS_MB) + DATA_ENSURE(1); if (! ONIGENC_IS_MBC_HEAD(encode, s)) goto fail; cclass_mb: @@ -3441,11 +3479,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - STRING_CMP(pstart, s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + STRING_CMP(s, pstart, n); + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } } INC_OP; JUMP_OUT; @@ -3468,11 +3508,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - STRING_CMP_IC(case_fold_flag, pstart, &s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + STRING_CMP_IC(case_fold_flag, pstart, &s, n); + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } } INC_OP; JUMP_OUT; @@ -3498,15 +3540,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - swork = s; - STRING_CMP_VALUE(pstart, swork, n, is_fail); - if (is_fail) continue; - s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE(swork, pstart, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } break; /* success */ } if (i == tlen) goto fail; @@ -3535,15 +3578,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - swork = s; - STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); - if (is_fail) continue; - s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } break; /* success */ } if (i == tlen) goto fail; @@ -3560,6 +3604,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int len; int level; MemNumType* mems; + UChar* ssave; n = 0; backref_with_level: @@ -3567,10 +3612,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, tlen = p->backref_general.num; mems = tlen == 1 ? &(p->backref_general.n1) : p->backref_general.ns; - sprev = s; + ssave = s; if (backref_match_at_nested_level(reg, stk, stk_base, n, case_fold_flag, level, (int )tlen, mems, &s, end)) { - if (sprev < end) { + if (ssave != s) { + sprev = ssave; while (sprev + (len = enclen(encode, sprev)) < s) sprev += len; } @@ -3658,7 +3704,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } JUMP_OUT; -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT CASE_OP(EMPTY_CHECK_END_MEMST) { int is_empty; @@ -3683,7 +3729,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int is_empty; mem = p->empty_check_end.mem; /* mem: null check id */ -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg); #else STACK_EMPTY_CHECK_REC(is_empty, mem, s); @@ -3851,14 +3897,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, goto repeat_inc_ng; CASE_OP(PREC_READ_START) - STACK_PUSH_POS(s, sprev); + 
STACK_PUSH_PREC_READ_START(s, sprev); INC_OP; JUMP_OUT; CASE_OP(PREC_READ_END) - STACK_EXEC_TO_VOID(stkp); + STACK_GET_PREC_READ_START(stkp); s = stkp->u.state.pstr; sprev = stkp->u.state.pstr_prev; + STACK_PUSH(STK_PREC_READ_END,0,0,0); INC_OP; JUMP_OUT; @@ -5443,6 +5490,9 @@ onig_builtin_error(OnigCalloutArgs* args, void* user_data ARG_UNUSED) if (n >= 0) { n = ONIGERR_INVALID_CALLOUT_BODY; } + else if (onig_is_error_code_needs_param(n)) { + n = ONIGERR_INVALID_CALLOUT_BODY; + } return n; } diff --git a/src/regext.c b/src/regext.c index fa4b360..965c793 100644 --- a/src/regext.c +++ b/src/regext.c @@ -29,6 +29,7 @@ #include "regint.h" +#if 0 static void conv_ext0be32(const UChar* s, const UChar* end, UChar* conv) { @@ -158,6 +159,7 @@ conv_encoding(OnigEncoding from, OnigEncoding to, const UChar* s, const UChar* e return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION; } +#endif extern int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, @@ -169,9 +171,7 @@ onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; if (ci->pattern_enc != ci->target_enc) { - r = conv_encoding(ci->pattern_enc, ci->target_enc, pattern, pattern_end, - &cpat, &cpat_end); - if (r != 0) return r; + return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION; } else { cpat = (UChar* )pattern; diff --git a/src/regint.h b/src/regint.h index 56767e8..38389a1 100644 --- a/src/regint.h +++ b/src/regint.h @@ -63,7 +63,7 @@ #define USE_CALL #define USE_CALLOUT #define USE_BACKREF_WITH_LEVEL /* \k, \k */ -#define USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ +#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR #define USE_RETRY_LIMIT_IN_MATCH @@ -348,8 +348,8 @@ typedef unsigned int MemStatusType; #define 
DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \ ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) -#define REPEAT_INFINITE -1 -#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) +#define INFINITE_REPEAT -1 +#define IS_INFINITE_REPEAT(n) ((n) == INFINITE_REPEAT) /* bitset */ #define BITS_PER_BYTE 8 diff --git a/src/regparse.c b/src/regparse.c index f1deea3..7f8b1a9 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -77,6 +77,7 @@ OnigSyntaxType OnigSyntaxOniguruma = { ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | + ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE @@ -1093,6 +1094,35 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name, return e->back_num; } +static int +name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end, + int** nums) +{ + regex_t* reg; + NameEntry* e; + + reg = env->reg; + e = name_find(reg, name, name_end); + + if (IS_NULL(e)) { + onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, + (UChar* )name, (UChar* )name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + + switch (e->back_num) { + case 0: + break; + case 1: + *nums = &(e->back_ref1); + break; + default: + *nums = e->back_refs; + break; + } + return e->back_num; +} + extern int onig_name_to_backref_number(regex_t* reg, const UChar* name, const UChar* name_end, OnigRegion *region) @@ -1869,8 +1899,8 @@ callout_tag_table_new(CalloutTagTable** rt) } static int -callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end, - CalloutTagVal entry_val) +callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name, + UChar* name_end, CalloutTagVal entry_val) { int r; CalloutTagVal val; @@ -1879,8 +1909,11 @@ callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end, return ONIGERR_INVALID_CALLOUT_TAG_NAME; val = callout_tag_find(t, 
name, name_end); - if (val >= 0) + if (val >= 0) { + onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, + name, name_end); return ONIGERR_MULTIPLEX_DEFINED_NAME; + } r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val); if (r < 0) return r; @@ -1909,7 +1942,7 @@ ext_ensure_tag_table(regex_t* reg) } static int -callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, +callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, CalloutTagVal entry_val) { int r; @@ -1921,7 +1954,7 @@ callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, ext = onig_get_regex_ext(reg); CHECK_NULL_RETURN_MEMERR(ext); - r = callout_tag_entry_raw(ext->tag_table, name, name_end, entry_val); + r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val); e = onig_reg_callout_list_at(reg, (int )entry_val); CHECK_NULL_RETURN_MEMERR(e); @@ -2391,10 +2424,10 @@ node_new_quantifier(int lower, int upper, int by_number) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_QUANT); - QUANT_(node)->lower = lower; - QUANT_(node)->upper = upper; - QUANT_(node)->greedy = 1; - QUANT_(node)->empty_info = BODY_IS_NOT_EMPTY; + QUANT_(node)->lower = lower; + QUANT_(node)->upper = upper; + QUANT_(node)->greedy = 1; + QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; QUANT_(node)->head_exact = NULL_NODE; QUANT_(node)->next_head_exact = NULL_NODE; QUANT_(node)->is_refered = 0; @@ -2694,7 +2727,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[0] = x; ns[1] = NULL_NODE; - x = node_new_quantifier(0, REPEAT_INFINITE, 1); + x = node_new_quantifier(0, INFINITE_REPEAT, 1); if (IS_NULL(x)) goto err; NODE_BODY(x) = ns[0]; @@ -3044,7 +3077,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (expr == NULL_NODE) { /* default expr \O* */ - quant = node_new_quantifier(0, REPEAT_INFINITE, 0); + quant = node_new_quantifier(0, INFINITE_REPEAT, 0); if (IS_NULL(quant)) goto err0; r = node_new_true_anychar(&body, env); @@ 
-3086,7 +3119,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (r != 0) goto err; possessive = 1; - r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, REPEAT_INFINITE, + r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT, possessive, is_range_cutter, env); if (r != 0) goto err; @@ -3236,10 +3269,18 @@ node_new_empty(void) static Node* node_new_str_raw_char(UChar c) { + int i; UChar p[1]; + Node* node; p[0] = c; - return node_new_str_raw(p, p + 1); + node = node_new_str_raw(p, p + 1); + + /* clear buf tail */ + for (i = 1; i < NODE_STRING_BUF_SIZE; i++) + STR_(node)->buf[i] = '\0'; + + return node; } static Node* @@ -3275,24 +3316,6 @@ str_node_can_be_split(Node* node, OnigEncoding enc) return 0; } -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR -static int -node_str_head_pad(StrNode* sn, int num, UChar val) -{ - UChar buf[NODE_STRING_BUF_SIZE]; - int i, len; - - len = sn->end - sn->s; - onig_strcpy(buf, sn->s, sn->end); - onig_strcpy(&(sn->s[num]), buf, buf + len); - sn->end += num; - - for (i = 0; i < num; i++) { - sn->s[i] = val; - } -} -#endif - extern int onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) { @@ -3877,19 +3900,19 @@ quantifier_type_num(QuantNode* q) if (q->greedy) { if (q->lower == 0) { if (q->upper == 1) return 0; - else if (IS_REPEAT_INFINITE(q->upper)) return 1; + else if (IS_INFINITE_REPEAT(q->upper)) return 1; } else if (q->lower == 1) { - if (IS_REPEAT_INFINITE(q->upper)) return 2; + if (IS_INFINITE_REPEAT(q->upper)) return 2; } } else { if (q->lower == 0) { if (q->upper == 1) return 3; - else if (IS_REPEAT_INFINITE(q->upper)) return 4; + else if (IS_INFINITE_REPEAT(q->upper)) return 4; } else if (q->lower == 1) { - if (IS_REPEAT_INFINITE(q->upper)) return 5; + if (IS_INFINITE_REPEAT(q->upper)) return 5; } } return -1; @@ -3926,8 +3949,8 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode) pnum = quantifier_type_num(p); cnum = quantifier_type_num(c); if (pnum < 0 || 
cnum < 0) { - if ((p->lower == p->upper) && ! IS_REPEAT_INFINITE(p->upper)) { - if ((c->lower == c->upper) && ! IS_REPEAT_INFINITE(c->upper)) { + if ((p->lower == p->upper) && ! IS_INFINITE_REPEAT(p->upper)) { + if ((c->lower == c->upper) && ! IS_INFINITE_REPEAT(c->upper)) { int n = onig_positive_int_multiply(p->lower, c->lower); if (n >= 0) { p->lower = p->upper = n; @@ -3946,11 +3969,11 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode) break; case RQ_A: NODE_BODY(pnode) = NODE_BODY(cnode); - p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; + p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1; break; case RQ_AQ: NODE_BODY(pnode) = NODE_BODY(cnode); - p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; + p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0; break; case RQ_QQ: NODE_BODY(pnode) = NODE_BODY(cnode); @@ -3959,13 +3982,13 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode) case RQ_P_QQ: NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 0; - c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; + c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1; return ; break; case RQ_PQ_Q: NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 1; - c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; + c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0; return ; break; case RQ_ASIS: @@ -4158,7 +4181,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) if (p == prev) { if (non_low != 0) goto invalid; - up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ + up = INFINITE_REPEAT; /* {n,} : {n,infinite} */ } } else { @@ -4178,7 +4201,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) } if (c != '}') goto invalid; - if (!IS_REPEAT_INFINITE(up) && low > up) { + if (!IS_INFINITE_REPEAT(up) && low > up) { /* {n,m}+ supported case */ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL)) return 
ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; @@ -4959,7 +4982,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 0; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -4967,7 +4990,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 1; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5358,10 +5381,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.ref1 = back_num; } else { - num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { @@ -5514,7 +5535,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 0; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5525,7 +5546,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 1; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5608,7 +5629,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.call.gnum = 0; tok->u.call.name = p; PINC; - if (! PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME; + if (! 
PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION; tok->u.call.name_end = p; break; @@ -6249,6 +6270,7 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) env->parse_depth++; if (env->parse_depth > ParseDepthLimit) return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + prev_cc = (CClassNode* )NULL; r = fetch_token_in_cc(tok, src, end, env); if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { @@ -6301,10 +6323,11 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) case TK_RAW_BYTE: /* tok->base != 0 : octal or hexadec. */ if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { + int i, j; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; UChar* psave = p; - int i, base = tok->base; + int base = tok->base; buf[0] = tok->u.c; for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { @@ -6322,6 +6345,9 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto err; } + /* clear buf tail */ + for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0'; + len = enclen(env->enc, buf); if (i < len) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; @@ -6359,8 +6385,13 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) val_entry: len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); if (len < 0) { - r = len; - goto err; + if (state != CCS_RANGE || + ! IS_SYNTAX_BV(env->syntax, + ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || + v < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { + r = len; + goto err; + } } in_type = (len == 1 ? 
CCV_SB : CCV_CODE_POINT); val_entry2: @@ -6673,7 +6704,7 @@ parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv } if (tag_start != tag_end) { - r = callout_tag_entry(env->reg, tag_start, tag_end, num); + r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); if (r != ONIG_NORMAL) return r; } @@ -6994,7 +7025,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en } if (tag_start != tag_end) { - r = callout_tag_entry(env->reg, tag_start, tag_end, num); + r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); if (r != ONIG_NORMAL) return r; } @@ -7271,10 +7302,8 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, int num; int* backs; - num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { @@ -7414,6 +7443,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, } break; +#ifdef USE_CAPTURE_HISTORY case '@': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { @@ -7441,6 +7471,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_UNDEFINED_GROUP_OPTION; } break; +#endif #ifdef USE_POSIXLINE_OPTION case 'p': @@ -7688,7 +7719,7 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) if (targetq_num >= 0 && nestq_num < 0) { if (targetq_num == 1 || targetq_num == 2) { /* * or + */ /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ - if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { + if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) { qn->upper = (qn->lower == 0 ? 
1 : qn->lower); } } @@ -7826,14 +7857,18 @@ static int parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, ScanEnv* env, int group_head) { - int r, len, group = 0; + int r, len, group; Node* qn; Node** tp; + unsigned int parse_depth; + group = 0; *np = NULL; if (tok->type == (enum TokenSyms )term) goto end_of_token; + parse_depth = env->parse_depth; + switch (tok->type) { case TK_ALT: case TK_EOT: @@ -7914,36 +7949,29 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { - if (len == enclen(env->enc, STR_(*np)->s)) {/* should not enclen_end() */ + if (len == enclen(env->enc, STR_(*np)->s)) { r = fetch_token(tok, src, end, env); - NODE_STRING_CLEAR_RAW(*np); - goto string_end; + goto tk_raw_byte_end; } } r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_RAW_BYTE) { - /* Don't use this, it is wrong for little endian encodings. */ -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR - int rem; - if (len < ONIGENC_MBC_MINLEN(env->enc)) { - rem = ONIGENC_MBC_MINLEN(env->enc) - len; - (void )node_str_head_pad(STR_(*np), rem, (UChar )0); - if (len + rem == enclen(env->enc, STR_(*np)->s)) { - NODE_STRING_CLEAR_RAW(*np); - goto string_end; - } - } -#endif + if (r != TK_RAW_BYTE) return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - } r = node_str_cat_char(*np, (UChar )tok->u.c); if (r < 0) return r; len++; } + + tk_raw_byte_end: + if (! 
ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + + NODE_STRING_CLEAR_RAW(*np); + goto string_end; } break; @@ -8055,7 +8083,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_ANYCHAR_ANYTIME: *np = node_new_anychar(); CHECK_NULL_RETURN_MEMERR(*np); - qn = node_new_quantifier(0, REPEAT_INFINITE, 0); + qn = node_new_quantifier(0, INFINITE_REPEAT, 0); CHECK_NULL_RETURN_MEMERR(qn); NODE_BODY(qn) = *np; *np = qn; @@ -8158,6 +8186,10 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (is_invalid_quantifier_target(*tp)) return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; + parse_depth++; + if (parse_depth > ParseDepthLimit) + return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, r == TK_INTERVAL); CHECK_NULL_RETURN_MEMERR(qn); diff --git a/src/regparse.h b/src/regparse.h index b7a2867..231f7b5 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -66,11 +66,11 @@ enum GimmickType { #endif }; -enum BodyEmpty { - BODY_IS_NOT_EMPTY = 0, - BODY_IS_EMPTY = 1, - BODY_IS_EMPTY_MEM = 2, - BODY_IS_EMPTY_REC = 3 +enum BodyEmptyType { + BODY_IS_NOT_EMPTY = 0, + BODY_IS_EMPTY_POSSIBILITY = 1, + BODY_IS_EMPTY_POSSIBILITY_MEM = 2, + BODY_IS_EMPTY_POSSIBILITY_REC = 3 }; typedef struct { @@ -101,7 +101,7 @@ typedef struct { int lower; int upper; int greedy; - enum BodyEmpty empty_info; + enum BodyEmptyType emptiness; struct _Node* head_exact; struct _Node* next_head_exact; int is_refered; /* include called node. 
don't eliminate even if {0} */ @@ -252,10 +252,6 @@ typedef struct _Node { #define NODE_BIT_CALL NODE_TYPE2BIT(NODE_CALL) #define NODE_BIT_GIMMICK NODE_TYPE2BIT(NODE_GIMMICK) -#define NODE_IS_SIMPLE_TYPE(node) \ - ((NODE_TYPE2BIT(NODE_TYPE(node)) & \ - (NODE_BIT_STRING | NODE_BIT_CCLASS | NODE_BIT_CTYPE | NODE_BIT_BACKREF)) != 0) - #define NODE_TYPE(node) ((node)->u.base.node_type) #define NODE_SET_TYPE(node, ntype) (node)->u.base.node_type = (ntype) @@ -314,7 +310,7 @@ typedef struct _Node { #define NODE_ST_CLEN_FIXED (1<<2) #define NODE_ST_MARK1 (1<<3) #define NODE_ST_MARK2 (1<<4) -#define NODE_ST_STOP_BT_SIMPLE_REPEAT (1<<5) +#define NODE_ST_STRICT_REAL_REPEAT (1<<5) #define NODE_ST_RECURSION (1<<6) #define NODE_ST_CALLED (1<<7) #define NODE_ST_ADDR_FIXED (1<<8) @@ -357,8 +353,8 @@ typedef struct _Node { #define NODE_IS_SUPER(node) ((NODE_STATUS(node) & NODE_ST_SUPER) != 0) #define NODE_IS_PROHIBIT_RECURSION(node) \ ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) -#define NODE_IS_STOP_BT_SIMPLE_REPEAT(node) \ - ((NODE_STATUS(node) & NODE_ST_STOP_BT_SIMPLE_REPEAT) != 0) +#define NODE_IS_STRICT_REAL_REPEAT(node) \ + ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0) #define NODE_BODY(node) ((node)->u.base.body) #define NODE_QUANT_BODY(node) ((node)->body) diff --git a/src/utf16_be.c b/src/utf16_be.c index 22bf74d..b66d868 100644 --- a/src/utf16_be.c +++ b/src/utf16_be.c @@ -2,7 +2,7 @@ utf16_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -103,7 +103,25 @@ utf16be_mbc_enc_len(const UChar* p) static int is_valid_mbc_string(const UChar* s, const UChar* end) { - return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF16_BE, s, end); + while (s < end) { + int len = utf16be_mbc_enc_len(s); + if (len == 4) { + if (s + 2 >= end) + return FALSE; + if (! UTF16_IS_SURROGATE_SECOND(*(s+2))) + return FALSE; + } + else + if (UTF16_IS_SURROGATE_SECOND(*s)) + return FALSE; + + s += len; + } + + if (s != end) + return FALSE; + else + return TRUE; } static int @@ -146,7 +164,15 @@ utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) static int utf16be_code_to_mbclen(OnigCodePoint code) { - return (code > 0xffff ? 4 : 2); + if (code > 0xffff) { + if (code > 0x10ffff) + return ONIGERR_INVALID_CODE_POINT_VALUE; + else + return 4; + } + else { + return 2; + } } static int @@ -243,7 +269,8 @@ utf16be_left_adjust_char_head(const UChar* start, const UChar* s) s--; } - if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) + if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1 && + UTF16_IS_SURROGATE_FIRST(*(s-2))) s -= 2; return (UChar* )s; diff --git a/src/utf16_le.c b/src/utf16_le.c index 4b231c6..cdc74b0 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -2,7 +2,7 @@ utf16_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -95,7 +95,15 @@ static const int EncLen_UTF16[] = { static int utf16le_code_to_mbclen(OnigCodePoint code) { - return (code > 0xffff ? 
4 : 2); + if (code > 0xffff) { + if (code > 0x10ffff) + return ONIGERR_INVALID_CODE_POINT_VALUE; + else + return 4; + } + else { + return 2; + } } static int @@ -110,7 +118,16 @@ is_valid_mbc_string(const UChar* p, const UChar* end) const UChar* end1 = end - 1; while (p < end1) { - p += utf16le_mbc_enc_len(p); + int len = utf16le_mbc_enc_len(p); + if (len == 4) { + if (p + 3 < end && ! UTF16_IS_SURROGATE_SECOND(*(p + 3))) + return FALSE; + } + else + if (UTF16_IS_SURROGATE_SECOND(*(p + 1))) + return FALSE; + + p += len; } if (p != end) @@ -252,7 +269,8 @@ utf16le_left_adjust_char_head(const UChar* start, const UChar* s) s--; } - if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) + if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1 && + UTF16_IS_SURROGATE_FIRST(*(s-1))) s -= 2; return (UChar* )s; diff --git a/test/test_utf8.c b/test/test_utf8.c index bab6b0d..2338526 100644 --- a/test/test_utf8.c +++ b/test/test_utf8.c @@ -1202,10 +1202,23 @@ extern int main(int argc, char* argv[]) x2("a{3,2}b", "aab", 0, 3); x2("a{3,2}?", "", 0, 0); /* == (?:a{3,2})?*/ x2("a{2,3}+a", "aaa", 0, 3); /* == (?:a{2,3})+*/ + x2("[\\x{0}-\\x{7fffffff}]", "a", 0, 1); + x2("[\\x{7f}-\\x{7fffffff}]", "\xe5\xae\xb6", 0, 3); + + n(" \xfd", ""); /* https://bugs.php.net/bug.php?id=77370 */ + /* can't use \xfc00.. 
because compiler error: hex escape sequence out of range */ + n("()0\\xfc00000\\xfc00000\\xfc00000\xfc", ""); /* https://bugs.php.net/bug.php?id=77371 */ + x2("000||0\xfa", "0", 0, 0); /* https://bugs.php.net/bug.php?id=77381 */ + e("(?i)000000000000000000000\xf0", "", ONIGERR_INVALID_CODE_POINT_VALUE); /* https://bugs.php.net/bug.php?id=77382 */ + n("0000\\\xf5", "0"); /* https://bugs.php.net/bug.php?id=77385 */ + n("(?i)FFF00000000000000000\xfd", ""); /* https://bugs.php.net/bug.php?id=77394 */ + x2("\\p{Common}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */ x2("\\p{In_Enclosed_CJK_Letters_and_Months}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */ + e("\\x{7fffffff}", "", ONIGERR_TOO_BIG_WIDE_CHAR_VALUE); + e("[\\x{7fffffff}]", "", ONIGERR_INVALID_CODE_POINT_VALUE); e("\\u040", "@", ONIGERR_INVALID_CODE_POINT_VALUE); e("(?\\g)", "zzzz", ONIGERR_NEVER_ENDING_RECURSION); e("(?<=(?>abc))", "abc", ONIGERR_INVALID_LOOK_BEHIND_PATTERN); diff --git a/test/testu.c b/test/testu.c index 4b053e5..397da95 100644 --- a/test/testu.c +++ b/test/testu.c @@ -116,28 +116,13 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not) #else regex_t* reg; - OnigCompileInfo ci; OnigErrorInfo einfo; uconv(pattern, cpat, ulen(pattern)); uconv(str, cstr, ulen(str)); -#if 0 r = onig_new(®, (UChar* )pattern, (UChar* )(pattern + ulen(pattern)), ONIG_OPTION_DEFAULT, ENC, ONIG_SYNTAX_DEFAULT, &einfo); -#else - ci.num_of_elements = 5; - ci.pattern_enc = ENC; - ci.target_enc = ENC; - ci.syntax = ONIG_SYNTAX_DEFAULT; - ci.option = ONIG_OPTION_DEFAULT; - ci.case_fold_flag = ONIGENC_CASE_FOLD_DEFAULT; - - r = onig_new_deluxe(®, (UChar* )pattern, - (UChar* )(pattern + ulen(pattern)), - &ci, &einfo); -#endif - if (r) { char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r, &einfo); -- cgit v1.2.3 From c3e46f9393d982d81ce46f63f7c7e368859bc4ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Wed, 7 Aug 2019 09:42:28 +0200 Subject: Remove upstream 
applied patches --- debian/changelog | 10 ++++++++++ debian/patches/series | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/debian/changelog b/debian/changelog index 2ae3106..da21bdf 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,13 @@ +libonig (6.9.3-1) UNRELEASED; urgency=medium + + * Neu upstream release. + * debian/watch:_Correct typo. + * Remove upstream applied patches: + - 0105-CVE-2019-13224.patch + - 0110-CVE-2019-13225.patch + + -- Jörg Frings-Fürst Wed, 07 Aug 2019 09:33:40 +0200 + libonig (6.9.2-1) unstable; urgency=medium * New upstream release: diff --git a/debian/patches/series b/debian/patches/series index e924636..1c34712 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -1,3 +1,3 @@ 0100-source_typos.patch -0105-CVE-2019-13224.patch -0110-CVE-2019-13225.patch +#0105-CVE-2019-13224.patch +#0110-CVE-2019-13225.patch -- cgit v1.2.3 From 68d1ec60c90d27c511d51ce0bef44b132a7ddf11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Wed, 7 Aug 2019 10:03:41 +0200 Subject: Refresh symbols file and add Build-Depends-Package field --- debian/changelog | 1 + debian/symbols | 2 ++ 2 files changed, 3 insertions(+) diff --git a/debian/changelog b/debian/changelog index da21bdf..0f1505f 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,6 +1,7 @@ libonig (6.9.3-1) UNRELEASED; urgency=medium * Neu upstream release. + - Refresh symbols file and add Build-Depends-Package field. * debian/watch:_Correct typo. 
* Remove upstream applied patches: - 0105-CVE-2019-13224.patch diff --git a/debian/symbols b/debian/symbols index 06628e7..89468b0 100644 --- a/debian/symbols +++ b/debian/symbols @@ -1,4 +1,5 @@ libonig.so.5 libonig5 #MINVER# +* Build-Depends-Package: libonig-dev OnigAsciiLowerMap@Base 6.8.1 OnigDefaultCaseFoldFlag@Base 6.8.1 OnigDefaultSyntax@Base 6.8.1 @@ -134,6 +135,7 @@ libonig.so.5 libonig5 #MINVER# onig_initialize_match_param@Base 6.8.1 onig_is_code_in_cc@Base 6.8.1 onig_is_code_in_cc_len@Base 6.8.1 + onig_is_error_code_needs_param@Base 6.9.3 onig_is_in_code_range@Base 6.8.1 onig_match@Base 6.8.1 onig_match_with_param@Base 6.8.1 -- cgit v1.2.3 From c527ea541a9633fb14391c981861e70070d9402f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 29 Nov 2019 11:26:26 +0100 Subject: Change year to 2019 --- debian/changelog | 7 ++++--- debian/copyright | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/debian/changelog b/debian/changelog index 0f1505f..d88a462 100644 --- a/debian/changelog +++ b/debian/changelog @@ -2,10 +2,11 @@ libonig (6.9.3-1) UNRELEASED; urgency=medium * Neu upstream release. - Refresh symbols file and add Build-Depends-Package field. + - Remove upstream applied patches: + + 0105-CVE-2019-13224.patch + + 0110-CVE-2019-13225.patch + - Refresh debain/copyright. * debian/watch:_Correct typo. 
- * Remove upstream applied patches: - - 0105-CVE-2019-13224.patch - - 0110-CVE-2019-13225.patch -- Jörg Frings-Fürst Wed, 07 Aug 2019 09:33:40 +0200 diff --git a/debian/copyright b/debian/copyright index cd813f1..6b10c03 100644 --- a/debian/copyright +++ b/debian/copyright @@ -2,7 +2,7 @@ Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0 Source: https://github.com/kkos/oniguruma Files: * -Copyright: 2002-2018 K.Kosako +Copyright: 2002-2019 K.Kosako License: BSD-2-clause License: BSD-2-clause @@ -30,7 +30,7 @@ License: BSD-2-clause Files: debian/* Copyright: 2006-2008 Max Kellermann - 2014-2018 Jörg Frings-Fürst + 2014-2019 Jörg Frings-Fürst License: GPL-2+ License: GPL-2+ -- cgit v1.2.3 From 4216de6a3336cbc6dddb572cb7e6ab6193bf3729 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 29 Nov 2019 11:26:35 +0100 Subject: New upstream version 6.9.4 --- .gitignore | 13 +- CMakeLists.txt | 6 +- HISTORY | 27 +- Makefile.am | 7 +- README.md | 40 +- build_harnesses.sh | 31 - configure.ac | 2 +- contributed/libfuzzer-onig.cpp | 45 - contributed/makefile | 21 - doc/API | 138 ++- doc/API.ja | 134 ++- doc/RE | 9 +- doc/RE.ja | 24 +- doc/SYNTAX.md | 9 +- doc/UNICODE_PROPERTIES | 2 +- harnesses/ascii_compatible.dict | 20 +- harnesses/deluxe-encode-harness.c | 39 +- harnesses/encode-harness.c | 291 +++++- harnesses/libfuzzer-onig.cpp | 45 + harnesses/makefile | 69 ++ harnesses/regset-harness.c | 379 ++++++++ harnesses/syntax-harness.c | 120 --- sample/Makefile.am | 10 +- sample/bug_fix.c | 2 +- sample/regset.c | 94 ++ src/Makefile.windows | 28 +- src/ascii.c | 2 +- src/big5.c | 23 +- src/config.h.win32 | 6 + src/config.h.win64 | 6 + src/config.h.windows.in | 7 + src/cp1251.c | 4 +- src/euc_jp.c | 21 +- src/euc_jp_prop.c | 2 +- src/euc_kr.c | 25 +- src/euc_tw.c | 18 +- src/gb18030.c | 28 +- src/gperf_fold_key_conv.py | 4 +- src/gperf_unfold_key_conv.py | 4 +- src/iso8859_1.c | 28 +- src/iso8859_10.c | 24 +- src/iso8859_11.c | 
2 +- src/iso8859_13.c | 28 +- src/iso8859_14.c | 25 +- src/iso8859_15.c | 28 +- src/iso8859_16.c | 24 +- src/iso8859_2.c | 24 +- src/iso8859_3.c | 28 +- src/iso8859_4.c | 27 +- src/iso8859_5.c | 15 +- src/iso8859_6.c | 2 +- src/iso8859_7.c | 22 +- src/iso8859_8.c | 2 +- src/iso8859_9.c | 28 +- src/koi8.c | 21 +- src/koi8_r.c | 15 +- src/make_property.sh | 3 +- src/make_unicode_egcb_data.py | 23 +- src/make_unicode_fold.sh | 2 +- src/make_unicode_fold_data.py | 31 +- src/make_unicode_property.sh | 3 +- src/make_unicode_property_data.py | 77 +- src/make_unicode_wb_data.py | 21 +- src/mktable.c | 2 +- src/onig_init.c | 2 +- src/oniggnu.h | 2 +- src/onigposix.h | 6 +- src/oniguruma.h | 34 +- src/regcomp.c | 1861 ++++++++++++++++++++++-------------- src/regenc.c | 66 +- src/regenc.h | 4 +- src/regerror.c | 2 +- src/regexec.c | 1895 +++++++++++++++++++++++++------------ src/regext.c | 2 +- src/reggnu.c | 2 +- src/regint.h | 368 +++---- src/regparse.c | 968 ++++++++++--------- src/regparse.h | 80 +- src/regposerr.c | 2 +- src/regposix.c | 2 +- src/regsyntax.c | 2 +- src/regtrav.c | 2 +- src/regversion.c | 2 +- src/sjis.c | 31 +- src/sjis_prop.c | 2 +- src/unicode.c | 12 +- src/unicode_egcb_data.c | 4 +- src/unicode_fold1_key.c | 6 +- src/unicode_fold2_key.c | 6 +- src/unicode_fold3_key.c | 6 +- src/unicode_fold_data.c | 2 +- src/unicode_property_data.c | 5 +- src/unicode_property_data_posix.c | 2 +- src/unicode_unfold_key.c | 6 +- src/unicode_wb_data.c | 4 +- src/utf16_be.c | 35 +- src/utf16_le.c | 35 +- src/utf32_be.c | 35 +- src/utf32_le.c | 34 +- src/utf8.c | 29 +- test/Makefile.am | 11 +- test/test_regset.c | 465 +++++++++ test/test_utf8.c | 82 +- test/testc.c | 3 +- test/testu.c | 3 +- 105 files changed, 5189 insertions(+), 3191 deletions(-) delete mode 100755 build_harnesses.sh delete mode 100644 contributed/libfuzzer-onig.cpp delete mode 100644 contributed/makefile create mode 100644 harnesses/libfuzzer-onig.cpp create mode 100644 harnesses/makefile create mode 
100644 harnesses/regset-harness.c delete mode 100644 harnesses/syntax-harness.c create mode 100644 sample/regset.c create mode 100644 test/test_regset.c diff --git a/.gitignore b/.gitignore index 227b7df..52d321a 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ onig-config libtool aclocal.m4 Makefile.in +.python-version *.o *.obj *.so @@ -35,7 +36,6 @@ m4/*.m4 /fuzzers # src/ -/src/CaseFolding.txt /src/unicode_fold?_key.gperf /src/unicode_unfold_key.gperf /src/UNICODE_PROPERTIES @@ -47,6 +47,8 @@ m4/*.m4 /test/testc /test/testcu /test/testp +/test/test_regset +/test/kofu-utf8.txt # sample/ /sample/crnl @@ -62,6 +64,15 @@ m4/*.m4 /sample/echo /sample/count /sample/bug_fix +/sample/regset /sample/log* /harnesses/utf16*.dict +/harnesses/*-libfuzzer +/harnesses/main-* +/harnesses/libfuzzer-onig +/harnesses/libfuzzer-onig-full +/harnesses/slow-unit-* +/harnesses/timeout-* +/harnesses/crash-* +/harnesses/oom-* diff --git a/CMakeLists.txt b/CMakeLists.txt index c59bfe3..bce888a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.1) project(oniguruma - VERSION 6.9.3 + VERSION 6.9.4 LANGUAGES C) set(PACKAGE onig) @@ -64,6 +64,9 @@ target_include_directories(onig PUBLIC $ $) +target_compile_definitions(onig PUBLIC + $<$>:ONIG_STATIC>) + if(MSVC) target_compile_options(onig PRIVATE #/W4 @@ -75,7 +78,6 @@ if(MSVC) $<$:/MT> $<$:/MTd> ) - target_compile_definitions(onig PUBLIC -DONIG_STATIC) endif() elseif(CMAKE_COMPILER_IS_GNUCC) target_compile_options(onig PRIVATE diff --git a/HISTORY b/HISTORY index 0380cb4..f4d4f67 100644 --- a/HISTORY +++ b/HISTORY @@ -1,8 +1,33 @@ History +2019/11/29: Version 6.9.4 + +2019/11/22: Release Candidate 3 for Version 6.9.4 + +2019/11/20: fix a problem found by libFuzzer test +2019/11/14: Release Candidate 2 for Version 6.9.4 +2019/11/12: fix integer overflow by nested quantifier +2019/11/11: fix CVE-2019-19012: Integer overflow related to reg->dmax in search_in_range() +2019/11/07: fix 
CVE-2019-19203: heap-buffer-overflow in gb18030_mbc_enc_len() +2019/11/06: fix CVE-2019-19204: heap-buffer-overflow in fetch_interval_quantifier() +2019/11/06: add HAVE_INTTYPES_H into config.h.windows.in and config.h.win{32,64} +2019/11/06: add HAVE_STDINT_H into config.h.win{32,64} +2019/11/05: Release Candidate 1 for Version 6.9.4 +2019/10/31: Update Unicode Emoji version to 12.1 (Nothing data changed) +2019/10/29: implement USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR configuration +2019/10/18: re-implement case fold conversion +2019/10/04: fix #156: Heap buffer overflow in match_at() with case-insensitive match +2019/09/30: NEW API: add onig_regset_replace() +2019/09/30: change Unicode VERSION value format +2019/09/20: NEW API: add regset functions +2019/09/20: add data ensure check before peek string value in OP_PUSH_IF_PEEK_NEXT +2019/09/20: fix loose code in encode-harness.c +2019/08/13: fix heap-buffer-overflow +2019/08/13: Add a macro to disable direct threading in the match engine (PR#149) + 2019/08/06: Version 6.9.3 (secirity fix release) -2019/07/30: add ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE +2019/07/30: add ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC 2019/07/29: add STK_PREC_READ_START/END stack type 2019/07/29: Fix #147: Stack Exhaustion Problem caused by some parsing functions 2019/07/11: add a dictionary file for libfuzzer diff --git a/Makefile.am b/Makefile.am index a0bbc7b..ac5e27f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,7 +14,7 @@ EXTRA_DIST = oniguruma.pc.in HISTORY README_japanese README.md \ doc/SYNTAX.md doc/UNICODE_PROPERTIES \ src/Makefile.windows src/config.h.windows.in \ src/config.h.win32 src/config.h.win64 \ - windows/testc.c contributed/libfuzzer-onig.cpp contributed/makefile + windows/testc.c bin_SCRIPTS = onig-config @@ -39,9 +39,12 @@ pkgconfig_DATA = oniguruma.pc all-test: cd test; make test +archive: + git archive --format=tar --prefix=oniguruma/ HEAD | gzip > ../oniguruma-archive.tar.gz + sanitize: make clean - 
./configure CC=clang CFLAGS="-O -g -fsanitize=address" + ./configure --enable-posix-api=no CC=clang CFLAGS="-O -g -fsanitize=address" LDFLAGS="-fsanitize=address" make make all-test diff --git a/README.md b/README.md index 6a4783b..eb215de 100644 --- a/README.md +++ b/README.md @@ -27,25 +27,34 @@ Supported character encodings: * doc/SYNTAX.md: contributed by seanofw +Version 6.9.4 +------------- + +* NEW API: RegSet (set of regexes) +* Fixed CVE-2019-19012 +* Fixed CVE-2019-19203 (Does not affect UTF-8, UTF-16 and UTF-32 encodings) +* Fixed CVE-2019-19204 (Affects only PosixBasic, Emacs and Grep syntaxes) +* Fixed CVE-2019-19246 +* Fixed some problems (found by libFuzzer test) + + Version 6.9.3 (security fix release) ------------------------------------ * Fixed CVE-2019-13224 * Fixed CVE-2019-13225 -* Fixed many problems (found by libfuzzer programs) +* Fixed CVE-2019-16163 +* Fixed many problems (found by libFuzzer test) Version 6.9.2 (Reiwa) --------------------- * add doc/SYNTAX.md +* Direct threaded code (for GCC and Clang) * Update Unicode version 12.1.0 * NEW: Unicode Text Segment mode option (?y{g}) (?y{w}) (*original) - g: Extended Grapheme Cluster mode / w: Word mode - - (Unicode Standard Annex #29 [http://unicode.org/reports/tr29/]) - Version 6.9.1 ------------- @@ -118,7 +127,7 @@ Version 6.5.0 * NEW: \O (true anychar) * NEW: if-then-else (?(...)...\|...) 
* NEW: Backreference validity checker (?(xxx)) (*original) -* NEW: Absent repeater (?~absent) \[is equal to (?\~\|absent|\O*)] +* NEW: Absent repeater (?~absent) \[is equal to (?\~\|(?:absent)|\O*)] * NEW: Absent expression (?~|absent|expr) (*original) * NEW: Absent stopper (?~|absent) (*original) @@ -244,15 +253,18 @@ Sample Programs |File |Description | |:---------------------|:-----------------------------------------| +|sample/callout.c |example of callouts | +|sample/count.c |example of built-in callout *COUNT | +|sample/echo.c |example of user defined callouts of name | +|sample/encode.c |example of some encodings | +|sample/listcap.c |example of the capture history | +|sample/names.c |example of the named group callback | +|sample/posix.c |POSIX API sample | +|sample/regset.c |example of using RegSet API | +|sample/scan.c |example of using onig_scan() | |sample/simple.c |example of the minimum (Oniguruma API) | -|sample/names.c |example of the named group callback. | -|sample/encode.c |example of some encodings. | -|sample/listcap.c |example of the capture history. | -|sample/posix.c |POSIX API sample. | -|sample/scan.c |example of using onig_scan(). | -|sample/sql.c |example of the variable meta characters. | -|sample/user_property.c|example of user defined Unicode property. | -|sample/callout.c |example of callouts. 
| +|sample/sql.c |example of the variable meta characters | +|sample/user_property.c|example of user defined Unicode property | Test Programs diff --git a/build_harnesses.sh b/build_harnesses.sh deleted file mode 100755 index 54dc9ff..0000000 --- a/build_harnesses.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -make clean -autoreconf -vfi - -# build the library with ASAN -#NO_LINK="-fsanitize=fuzzer-no-link" -NO_LINK="" -./configure CC=clang LD=clang CFLAGS="-g -fsanitize=address -fno-omit-frame-pointer $NO_LINK" LDFLAGS="-g -fsanitize=address -fno-omit-frame-pointer $NO_LINK" -make -j4 - -OUT=`pwd`/fuzzers -mkdir -p $OUT -LIBFUZZER_FLAGS="-fsanitize=fuzzer,address -fno-omit-frame-pointer" -#LIBS="src/.libs/libonig.a" -LIBS="src/.libs/libonig.a /usr/local/lib/libLLVMFuzzerMain.a" - -CFLAGS="-Isrc -g $LIBFUZZER_FLAGS" - -# Libfuzzer builds -clang++ contributed/libfuzzer-onig.cpp $LIBS $CFLAGS -o $OUT/libfuzzer-onig -clang harnesses/syntax-harness.c $LIBS $CFLAGS -o $OUT/syntax-libfuzzer -clang harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/encode-libfuzzer -clang harnesses/deluxe-encode-harness.c $LIBS $CFLAGS -o $OUT/deluxe-encode-libfuzzer - -clang -DUTF16_BE harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/utf16-be-libfuzzer -clang -DUTF16_LE harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/utf16-le-libfuzzer -clang -DWITH_READ_MAIN harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-encode -clang -DWITH_READ_MAIN -DUTF16_LE harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-utf16-le -clang -DWITH_READ_MAIN -DUTF16_BE harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-utf16-be -clang -DWITH_READ_MAIN harnesses/deluxe-encode-harness.c $LIBS $CFLAGS -o $OUT/main-deluxe-encode diff --git a/configure.ac b/configure.ac index 62c9fa5..ac51e85 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ dnl Process this file with autoconf to produce a configure script. 
-AC_INIT(onig, 6.9.3) +AC_INIT(onig, 6.9.4) AC_CONFIG_MACRO_DIR([m4]) diff --git a/contributed/libfuzzer-onig.cpp b/contributed/libfuzzer-onig.cpp deleted file mode 100644 index 526c826..0000000 --- a/contributed/libfuzzer-onig.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* libfuzzer test code for oniguruma - * author: Hanno Böck, license: CC0/public domain - -Usage: -* compile oniguruma with something like - ./configure CC=clang LD=clang CFLAGS="-fsanitize-coverage=edge -fsanitize=address" \ - LDFLAGS="-fsanitize-coverage=edge -fsanitize=address" -* Compile libfuzzer stub and link against static libonig.a and libFuzzer.a: - clang++ libfuzzer-onig.cpp src/.libs/libonig.a libFuzzer.a -o libfuzzer-onig \ - -fsanitize-coverage=edge -fsanitize=address -* Put sample patterns in directory "in/" -* Run - ./libfuzzer-onig in - -Consult libfuzzer docs for further details and how to create libFuzzer.a: -http://llvm.org/docs/LibFuzzer.html - - */ -#include -#include -#include - -extern "C" int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) -{ - regex_t *reg; - OnigEncoding enc; - - enc = ONIG_ENCODING_UTF8; - -#ifdef FULL_TEST - onig_initialize(&enc, 1); - onig_set_retry_limit_in_match(120); - onig_set_parse_depth_limit(120); -#endif - - if (onig_new(®, Data, Data + Size, ONIG_OPTION_DEFAULT, enc, - ONIG_SYNTAX_DEFAULT, 0) == 0) - onig_free(reg); - -#ifdef FULL_TEST - onig_end(); -#endif - - return 0; -} diff --git a/contributed/makefile b/contributed/makefile deleted file mode 100644 index f44a3c0..0000000 --- a/contributed/makefile +++ /dev/null @@ -1,21 +0,0 @@ -ONIG_LIB=../src/.libs/libonig.a -LIBS=$(ONIG_LIB) /usr/local/lib/libLLVMFuzzerMain.a - -TARGETS=libfuzzer-onig libfuzzer-onig-full - -default: $(TARGETS) - -libfuzzer-onig: libfuzzer-onig.cpp $(ONIG_LIB) - clang++ $< $(LIBS) -o $@ -fsanitize-coverage=trace-pc-guard -fsanitize=fuzzer,address - -libfuzzer-onig-full: libfuzzer-onig.cpp $(ONIG_LIB) - clang++ -DFULL_TEST $< $(LIBS) -o $@ 
-fsanitize-coverage=trace-pc-guard -fsanitize=fuzzer,address - - -$(ONIG_LIB): - cd ..; ./configure CC=clang LD=clang CFLAGS="-g -fsanitize=fuzzer,address" LDFLAGS="-fsanitize-coverage=trace-pc-guard -fsanitize=fuzzer,address"; make - - - -clean: - rm -f $(TARGETS) diff --git a/doc/API b/doc/API index 049db02..43d5338 100644 --- a/doc/API +++ b/doc/API @@ -1,4 +1,4 @@ -Oniguruma API Version 6.9.3 2019/07/06 +Oniguruma API Version 6.9.4 2019/09/30 #include @@ -168,7 +168,7 @@ Oniguruma API Version 6.9.3 2019/07/06 # int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo) - This function is deprecate, and it does not allow the case where + This function is deprecated, and it does not allow the case where the encoding of pattern and target is different. Create a regex object. @@ -306,6 +306,7 @@ Oniguruma API Version 6.9.3 2019/07/06 normal return: match position offset (i.e. p - str >= 0) not found: ONIG_MISMATCH (< 0) + error: error code (< 0) arguments 1 reg: regex object @@ -342,7 +343,8 @@ Oniguruma API Version 6.9.3 2019/07/06 Do not pass invalid byte string in the regex character encoding. normal return: match length (>= 0) - not match: ONIG_MISMATCH ( < 0) + not match: ONIG_MISMATCH (< 0) + error: error code (< 0) arguments 1 reg: regex object @@ -391,6 +393,136 @@ Oniguruma API Version 6.9.3 2019/07/06 7 callback_arg: optional argument passed to callback +# int onig_regset_new(OnigRegSet** rset, int n, regex_t* regs[]) + + Create a regset object. + All regex objects must have the same character encoding. + All regex objects are prohibited from having the ONIG_OPTION_FIND_LONGEST option. + + arguments + 1 rset: return address of regset object + 2 n: number of regex in regs + 3 regs: array of regex + + normal return: ONIG_NORMAL + + +# int onig_regset_add(OnigRegSet* set, regex_t* reg) + + Add a regex into regset. + The regex object must have the same character encoding with the regset. 
+ The regex object is prohibited from having the ONIG_OPTION_FIND_LONGEST option. + + arguments + 1 set: regset object + 2 reg: regex object + + normal return: ONIG_NORMAL + + +# int onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) + + Replace a regex in regset with another one. + If the reg argument value is NULL, then remove at-th regex. (and indexes of other regexes are changed) + + arguments + 1 set: regset object + 2 at: index of regex (zero origin) + 3 reg: regex object + + normal return: ONIG_NORMAL + + +# void onig_regset_free(OnigRegSet* set) + + Free memory used by regset object and regex objects in the regset. + If the same regex object is registered twice, the situation becomes destructive. + + arguments + 1 set: regset object + + +# int onig_regset_number_of_regex(OnigRegSet* set) + + Returns number of regex objects in the regset. + + arguments + 1 set: regset object + + +# regex_t* onig_regset_get_regex(OnigRegSet* set, int at) + + Returns the regex object corresponding to the at-th regex. + + arguments + 1 set: regset object + 2 at: index of regex array (zero origin) + + +# OnigRegion* onig_regset_get_region(OnigRegSet* set, int at) + + Returns the region object corresponding to the at-th regex. + + arguments + 1 set: regset object + 2 at: index of regex array (zero origin) + + +# int onig_regset_search(OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, int* rmatch_pos) + + Perform a search with regset. 
+ + return value: + normal return: index of match regex (zero origin) + not found: ONIG_MISMATCH (< 0) + error: error code (< 0) + + arguments + 1 set: regset object + 2 str: target string + 3 end: terminate address of target string + 4 start: search start address of target string + 5 range: search terminate address of target string + 6 lead: outer loop element + ONIG_REGSET_POSITION_LEAD (returns most left position) + ONIG_REGSET_REGEX_LEAD (returns most left position) + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (returns first match regex) + 7 option: search time option + ONIG_OPTION_NOTBOL string head(str) isn't considered as begin of line + ONIG_OPTION_NOTEOL string end (end) isn't considered as end of line + 8 rmatch_pos: return address of match position (match_address - str) + + * ONIG_REGSET_POSITION_LEAD and ONIG_REGSET_REGEX_LEAD return the same result. + These differences only appear in search time. + In most cases, ONIG_REGSET_POSITION_LEAD seems to be faster. + + +# int onig_regset_search_with_param(OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, OnigMatchParam* mps[], int* rmatch_pos) + + Perform a search with regset and match-params. 
+ + return value: + normal return: index of match regex (zero origin) + not found: ONIG_MISMATCH (< 0) + error: error code (< 0) + + arguments + 1 set: regset object + 2 str: target string + 3 end: terminate address of target string + 4 start: search start address of target string + 5 range: search terminate address of target string + 6 lead: outer loop element + ONIG_REGSET_POSITION_LEAD (returns most left position) + ONIG_REGSET_REGEX_LEAD (returns most left position) + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (returns first match regex) + 7 option: search time option + ONIG_OPTION_NOTBOL string head(str) isn't considered as begin of line + ONIG_OPTION_NOTEOL string end (end) isn't considered as end of line + 8 mps: array of match-params + 9 rmatch_pos: return address of match position (match_address - str) + + # OnigRegion* onig_region_new(void) Create a region. diff --git a/doc/API.ja b/doc/API.ja index 5871558..10ee1cd 100644 --- a/doc/API.ja +++ b/doc/API.ja @@ -1,4 +1,4 @@ -鬼車インターフェース Version 6.9.3 2019/07/06 +鬼車インターフェース Version 6.9.4 2019/09/30 #include @@ -390,6 +390,138 @@ 7 callback_arg: コールバック関数に渡される付加引数値 +# int onig_regset_new(OnigRegSet** rset, int n, regex_t* regs[]) + + regsetオブジェクトを生成する。 + 全ての正規表現オブジェクトは、同じ文字エンコーディングでなければならない。 + 全ての正規表現オブジェクトは、ONIG_OPTION_FIND_LONGESTオプションでコンパイルされていてはならない。 + + 引数 + 1 rset: regsetオブジェクトを返すためのアドレス + 2 n: 正規表現の個数 + 3 regs: 正規表現オブジェクトの配列 + + 正常終了戻り値: ONIG_NORMAL + + +# int onig_regset_add(OnigRegSet* set, regex_t* reg) + + regsetオブジェクトに正規表現を追加する。 + 正規表現オブジェクトは、regsetと同じ文字エンコーディングでなければならない。 + 正規表現オブジェクトは、ONIG_OPTION_FIND_LONGESTオプションでコンパイルされていてはならない。 + + 引数 + 1 set: regsetオブジェクト + 2 reg: 正規表現オブジェクト + + 正常終了戻り値: ONIG_NORMAL + + +# int onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) + + regsetの中の一個の正規表現オブジェクトを別のものに変更する。 + 若しreg引数の値がNULLであれば、at番目の正規表現オブジェクトを外す。(そして、以降の正規表現オブジェクトのインデックスは変化する) + + 引数 + 1 set: regsetオブジェクト + 2 at: 変更する場所のインデックス + 2 reg: 正規表現オブジェクト + + 正常終了戻り値: ONIG_NORMAL + + +# void 
onig_regset_free(OnigRegSet* set) + + regsetオブジェクトとその中の正規表現オブジェクトの使用メモリを開放する。 + 若し、同一の正規表現オブジェクトを重複して登録していれば、破壊的な状況になる。 + + 引数 + 1 set: regsetオブジェクト + + +# int onig_regset_number_of_regex(OnigRegSet* set) + + regsetの中の正規表現オブジェクトの個数を返す。 + + 引数 + 1 set: regsetオブジェクト + + +# regex_t* onig_regset_get_regex(OnigRegSet* set, int at) + + regsetのat番目の正規表現を返す。 + + 引数 + 1 set: regsetオブジェクト + 2 at: 正規表現オブジェクトのインデックス (ゼロ開始) + + +# OnigRegion* onig_regset_get_region(OnigRegSet* set, int at) + + regsetのat番目の正規表現に対応する領域を返す。 + + 引数 + 1 set: regsetオブジェクト + 2 at: 正規表現オブジェクトのインデックス (ゼロ開始) + + +# int onig_regset_search(OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, int* rmatch_pos) + + regsetによる検索を実行する。 + + 戻り値: + 検索成功: マッチした正規表現オブジェクトのインデックス (ゼロ開始) + 検索失敗: ONIG_MISMATCH (< 0) + エラー: エラーコード (< 0) + + 引数 + 1 set: regsetオブジェクト + 2 str: 検索対象文字列 + 3 end: 検索対象文字列の終端アドレス + 4 start: 検索対象文字列の検索先頭位置アドレス + 5 range: 検索対象文字列の検索終了位置アドレス + (start <= 探索される文字列 < range) + 6 lead: 外側のループ要素 + ONIG_REGSET_POSITION_LEAD (最左位置でマッチした結果を返す) + ONIG_REGSET_REGEX_LEAD (最左位置でマッチした結果を返す) + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (最初にマッチした正規表現の結果を返す) + 7 option: 検索時オプション + ONIG_OPTION_NOTBOL 文字列の先頭(str)を行頭と看做さない + ONIG_OPTION_NOTEOL 文字列の終端(end)を行末と看做さない + 8 rmatch_pos: マッチした位置を返すためのアドレス (match_address - str) + + * ONIG_REGSET_POSITION_LEADとONIG_REGSET_REGEX_LEADは同じ結果を返す。 + これらの違いは検索時間にしか現れない。 + ほとんどの場合、ONIG_REGSET_POSITION_LEADのほうが速いと思われる。 + + +# int onig_regset_search_with_param(OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, OnigMatchParam* mps[], int* rmatch_pos) + + regsetとOnigMatchParamオブジェクトによる検索を実行する。 + + 戻り値: + 検索成功: マッチした正規表現オブジェクトのインデックス (ゼロ開始) + 検索失敗: ONIG_MISMATCH (< 0) + エラー: エラーコード (< 0) + + 引数 + 1 set: regsetオブジェクト + 2 str: 検索対象文字列 + 3 end: 検索対象文字列の終端アドレス + 4 start: 検索対象文字列の検索先頭位置アドレス + 5 range: 
検索対象文字列の検索終了位置アドレス + (start <= 探索される文字列 < range) + 6 lead: 外側のループ要素 + ONIG_REGSET_POSITION_LEAD (最左位置でマッチした結果を返す) + ONIG_REGSET_REGEX_LEAD (最左位置でマッチした結果を返す) + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (最初にマッチした正規表現の結果を返す) + 7 option: 検索時オプション + ONIG_OPTION_NOTBOL 文字列の先頭(str)を行頭と看做さない + ONIG_OPTION_NOTEOL 文字列の終端(end)を行末と看做さない + 8 mps: OnigMatchParamオブジェクトの配列 + 9 rmatch_pos: マッチした位置を返すためのアドレス (match_address - str) + + # OnigRegion* onig_region_new(void) マッチ領域情報(region)を作成する。 diff --git a/doc/RE b/doc/RE index 72957dd..599d2a6 100644 --- a/doc/RE +++ b/doc/RE @@ -1,4 +1,4 @@ -Oniguruma Regular Expressions Version 6.9.2 2019/03/29 +Oniguruma Regular Expressions Version 6.9.4 2019/10/31 syntax: ONIG_SYNTAX_ONIGURUMA (default) @@ -289,6 +289,11 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) In negative look-behind, capturing group isn't allowed, but non-capturing group (?:) is allowed. + * In look-behind and negative look-behind, support for + ignore-case option is limited. Only supports conversion + between single characters. (Does not support conversion + of multiple characters in Unicode) + (?>subexp) atomic group no backtracks in subexp. @@ -338,7 +343,7 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) This works like .* (more precisely \O*), but it is limited by the range that does not include the string match with . - This is a written abbreviation of (?~|absent|\O*). + This is a written abbreviation of (?~|(?:absent)|\O*). \O* is used as a repeater. 
(?~|absent|exp) Absent expression (* original) diff --git a/doc/RE.ja b/doc/RE.ja index c09e237..2156d93 100644 --- a/doc/RE.ja +++ b/doc/RE.ja @@ -1,4 +1,4 @@ -鬼車 正規表現 Version 6.9.2 2019/03/29 +鬼車 正規表現 Version 6.9.4 2019/10/31 使用文法: ONIG_SYNTAX_ONIGURUMA (既定値) @@ -21,10 +21,10 @@ \f 改頁 (0x0C) \a 鐘 (0x07) \e 退避修飾 (0x1B) - \nnn 八進数表現 符号化バイト値(の一部) + \nnn 八進数表現 符号化バイト値 \o{17777777777} 拡張八進数表現 コードポイント値 \uHHHH 拡張十六進数表現 コードポイント値 - \xHH 十六進数表現 符号化バイト値(の一部) + \xHH 十六進数表現 符号化バイト値 \x{7HHHHHHH} 拡張十六進数表現 コードポイント値 \cx 制御文字表現 コードポイント値 \C-x 制御文字表現 コードポイント値 @@ -284,6 +284,10 @@ 否定戻り読みでは、捕獲式集合は許されないが、 非捕獲式集合は許される。 + * 戻り読み、否定戻り読みの中では、ignore-caseオプションの + 対応が制限される。一文字と一文字の間の変換しか対応しない。 + (Unicodeでの複数文字の変換に対応しない) + (?>式) 原子的式集合 式全体を通過したとき、式の中での後退再試行を行なわない @@ -334,20 +338,20 @@ <不在機能群> - (?~不在式) 不在繰り返し (*原案 田中哲) - これは .*(より正確には\O*)のように動作するが、<不在式>に + (?~不在) 不在繰り返し (*原案 田中哲) + これは .*(より正確には\O*)のように動作するが、<不在>に 適合する文字列を含まない範囲に制限される。 - これは(?~|不在式|\O*)の省略表記である。 + これは(?~|(?:不在)|\O*)の省略表記である。 - (?~|不在式|式) 不在式 (* 原作) - これは<式>のように動作するが、<不在式>に適合する文字列を + (?~|不在|式) 不在式 (* 原作) + これは<式>のように動作するが、<不在>に適合する文字列を 含まない範囲に制限される。 例 (?~|345|\d*) "12345678" ==> "12", "1", "" - (?~|不在式) 不在停止 (* 原作) + (?~|不在) 不在停止 (* 原作) この演算子を通過した後は、対象文字列の適合範囲が - <不在式>に適合する文字列を含まない範囲に制限される。 + <不在>に適合する文字列を含まない範囲に制限される。 (?~|) 範囲消去 不在停止の効果を消して、それ以前の状態にする。 diff --git a/doc/SYNTAX.md b/doc/SYNTAX.md index 449f262..69ecf3a 100644 --- a/doc/SYNTAX.md +++ b/doc/SYNTAX.md @@ -1,7 +1,7 @@ # Oniguruma syntax (operator) configuration -_Documented for Oniguruma 6.9.2 (2019/03/28)_ +_Documented for Oniguruma 6.9.3 (2019/08/08)_ ---------- @@ -960,6 +960,12 @@ _Set in: Ruby, Oniguruma_ If this flag is set, Oniguruma will warn about nested repeat operators those have no meaning, like `(?:a*)+`. If this flag is clear, Oniguruma will allow the nested repeat operators without warning about them. +### 26. 
ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC (allow [a-\x{7fffffff}]) + +_Set in: Oniguruma_ + +If this flag is set, then invalid code points at the end of range in character class are allowed. + ### 31. ONIG_SYN_CONTEXT_INDEP_ANCHORS _Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ @@ -1066,4 +1072,5 @@ These tables show which of the built-in syntaxes use which flags and options, fo | 23 | `ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | | 24 | `ONIG_SYN_WARN_CC_OP_NOT_ESCAPED` | - | - | - | - | - | - | - | - | Yes | Yes | | 25 | `ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT` | - | - | - | - | - | - | - | - | Yes | Yes | +| 26 | `ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC` | - | - | - | - | - | - | - | - | - | Yes | | 31 | `ONIG_SYN_CONTEXT_INDEP_ANCHORS` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | diff --git a/doc/UNICODE_PROPERTIES b/doc/UNICODE_PROPERTIES index ff2a6ce..24c2031 100644 --- a/doc/UNICODE_PROPERTIES +++ b/doc/UNICODE_PROPERTIES @@ -1,4 +1,4 @@ -Unicode Properties (from Unicode Version: 12.1.0) +Unicode Properties (Unicode Version: 12.1.0, Emoji: 12.1) 15: ASCII_Hex_Digit 16: Adlam diff --git a/harnesses/ascii_compatible.dict b/harnesses/ascii_compatible.dict index 820bf47..e6e00db 100644 --- a/harnesses/ascii_compatible.dict +++ b/harnesses/ascii_compatible.dict @@ -1,10 +1,7 @@ # First-pass fuzzing dictionary for Oniguruma by Mark Griffin -"\\o{17777777777}" -"\\777" -"\\u" -"\\uFFFF" -"\\xFF" -"\\x{70000000}" +"\\o{34}" +"\\123" +"\\x{40}" "\\C-" "\\M-\\C-" "\\X" @@ -12,6 +9,8 @@ "\\p{^" "}" "]" +"]" +")" ")" "\\n" "\\r" @@ -47,10 +46,13 @@ "\\B" "(?y{" "[abcd1-9]" +"[\\w]" +"[\\W]" +"[\\s]" +"[\\S]" "[\\w\\d" "[\\p{Alphabetic}" -"[\\P{Arabic}" -"[\\x{ffff}" +"[\\x{03}" "[a-w&&" "[^" "[:graph:]" @@ -88,7 +90,6 @@ "(?())" "(?())" "(?())" -"(*ERROR{-2000})" "(*COUNT[tag]{X})" "\\1" "\\2" @@ -106,6 +107,5 @@ "(?a|b\\gc)" "(?-i:\\g)" "\\N{name}" -"\\p{Hiragana}" 
"\\p{Katakana}" "\\p{Emoji}" diff --git a/harnesses/deluxe-encode-harness.c b/harnesses/deluxe-encode-harness.c index e1f84a5..aabe916 100644 --- a/harnesses/deluxe-encode-harness.c +++ b/harnesses/deluxe-encode-harness.c @@ -49,39 +49,6 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) return 0; } -static int -exec(OnigEncoding enc, OnigOptionType options, - char* apattern, char* apattern_end, char* astr, char* astr_end) -{ - int r; - regex_t* reg; - OnigErrorInfo einfo; - UChar* pattern = (UChar* )apattern; - UChar* str = (UChar* )astr; - UChar* pattern_end = (UChar* )apattern_end; - unsigned char *end = (unsigned char* )astr_end; - - onig_initialize(&enc, 1); - onig_set_retry_limit_in_match(DEFAULT_LIMIT); - onig_set_parse_depth_limit(DEFAULT_LIMIT); - - r = onig_new(®, pattern, pattern_end, - options, enc, ONIG_SYNTAX_DEFAULT, &einfo); - if (r != ONIG_NORMAL) { - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str((UChar* )s, r, &einfo); - fprintf(stdout, "ERROR: %s\n", s); - onig_end(); - return -1; - } - - r = search(reg, str, end); - - onig_free(reg); - onig_end(); - return 0; -} - static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; static int @@ -196,15 +163,13 @@ int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) remaining_size--; // copy first PATTERN_SIZE bytes off to be the pattern - pattern = (unsigned char *)malloc(PATTERN_SIZE+4); - memset(pattern, 0, PATTERN_SIZE+4); + pattern = (unsigned char *)malloc(PATTERN_SIZE); memcpy(pattern, data, PATTERN_SIZE); pattern_end = pattern + PATTERN_SIZE; data += PATTERN_SIZE; remaining_size -= PATTERN_SIZE; - str = (unsigned char*)malloc(remaining_size+4); - memset(str, 0, remaining_size+4); + str = (unsigned char*)malloc(remaining_size); memcpy(str, data, remaining_size); str_end = str + remaining_size; diff --git a/harnesses/encode-harness.c b/harnesses/encode-harness.c index e57fd4f..5db0512 100644 --- a/harnesses/encode-harness.c +++ b/harnesses/encode-harness.c @@ -3,13 
+3,19 @@ * contributed by Mark Griffin */ #include -#include "oniguruma.h" - +#include #include #include +#include +#include +#include +#include -#define PARSE_DEPTH_LIMIT 120 -#define RETRY_LIMIT 4000 +#include "oniguruma.h" + + +//#define PARSE_DEPTH_LIMIT 120 +#define RETRY_LIMIT 3500 typedef unsigned char uint8_t; @@ -26,6 +32,7 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) range = end; r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); if (r >= 0) { +#ifdef WITH_READ_MAIN int i; fprintf(stdout, "match at %d (%s)\n", r, @@ -33,17 +40,29 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) for (i = 0; i < region->num_regs; i++) { fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); } +#endif } else if (r == ONIG_MISMATCH) { +#ifdef WITH_READ_MAIN fprintf(stdout, "search fail (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); +#endif } else { /* error */ +#ifdef WITH_READ_MAIN char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); fprintf(stdout, "ERROR: %s\n", s); fprintf(stdout, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); +#endif onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + + if (r == ONIGERR_STACK_BUG || + r == ONIGERR_UNDEFINED_BYTECODE || + r == ONIGERR_UNEXPECTED_BYTECODE) + return -2; + return -1; } @@ -51,8 +70,14 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) return 0; } +static long INPUT_COUNT; +static long EXEC_COUNT; +static long EXEC_COUNT_INTERVAL; +static long REGEX_SUCCESS_COUNT; +static long VALID_STRING_COUNT; + static int -exec(OnigEncoding enc, OnigOptionType options, +exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax, char* apattern, char* apattern_end, char* astr, UChar* end) { int r; @@ -62,22 +87,41 @@ exec(OnigEncoding enc, OnigOptionType options, UChar* str = (UChar* )astr; UChar* pattern_end = (UChar* )apattern_end; + EXEC_COUNT++; + EXEC_COUNT_INTERVAL++; + onig_initialize(&enc, 1); 
onig_set_retry_limit_in_match(RETRY_LIMIT); - onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT); + //onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT); r = onig_new(®, pattern, pattern_end, - options, enc, ONIG_SYNTAX_DEFAULT, &einfo); + options, enc, syntax, &einfo); if (r != ONIG_NORMAL) { char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r, &einfo); +#ifdef WITH_READ_MAIN fprintf(stdout, "ERROR: %s\n", s); +#endif onig_end(); - return -1; + + if (r == ONIGERR_PARSER_BUG || + r == ONIGERR_STACK_BUG || + r == ONIGERR_UNDEFINED_BYTECODE || + r == ONIGERR_UNEXPECTED_BYTECODE) { + return -2; + } + else + return -1; } + REGEX_SUCCESS_COUNT++; + + r = search(reg, pattern, pattern_end); + if (r == -2) return -2; if (onigenc_is_valid_mbc_string(enc, str, end) != 0) { + VALID_STRING_COUNT++; r = search(reg, str, end); + if (r == -2) return -2; } onig_free(reg); @@ -85,52 +129,114 @@ exec(OnigEncoding enc, OnigOptionType options, return 0; } -#define PATTERN_SIZE 32 -#define NUM_CONTROL_BYTES 1 -#define MIN_STR_SIZE 1 -int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +#if 0 +static void +output_data(char* path, const uint8_t * data, size_t size) { - if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) - return 0; - if (Size > 0x1000) - return 0; + int fd; + ssize_t n; + fd = open(path, O_CREAT|O_RDWR, S_IRUSR|S_IRGRP|S_IROTH); + if (fd == -1) { + fprintf(stderr, "ERROR: output_data(): can't open(%s)\n", path); + return ; + } + + n = write(fd, (const void* )data, size); + if (n != size) { + fprintf(stderr, "ERROR: output_data(): n: %ld, size: %ld\n", n, size); + } + close(fd); +} +#endif + + +static int +alloc_exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax, + int pattern_size, size_t remaining_size, unsigned char *data) +{ + int r; unsigned char *pattern_end; unsigned char *str_null_end; - size_t remaining_size = Size; - unsigned char *data = (unsigned char *)(Data); + // copy first PATTERN_SIZE bytes off to be the 
pattern + unsigned char *pattern = (unsigned char *)malloc(pattern_size != 0 ? pattern_size : 1); + memcpy(pattern, data, pattern_size); + pattern_end = pattern + pattern_size; + data += pattern_size; + remaining_size -= pattern_size; - // pull off one byte to switch off - unsigned char encoding_choice = data[0]; - data++; - remaining_size--; +#if defined(UTF16_BE) || defined(UTF16_LE) + if (remaining_size % 2 == 1) remaining_size--; +#endif - // copy first PATTERN_SIZE bytes off to be the pattern - unsigned char *pattern = (unsigned char *)malloc(PATTERN_SIZE+4); - memset(pattern, 0, PATTERN_SIZE+4); - memcpy(pattern, data, PATTERN_SIZE); - pattern_end = pattern + PATTERN_SIZE; - data += PATTERN_SIZE; - remaining_size -= PATTERN_SIZE; - - unsigned char *str = (unsigned char*)malloc(remaining_size+4); - memset(str, 0, remaining_size+4); + unsigned char *str = (unsigned char*)malloc(remaining_size != 0 ? remaining_size : 1); memcpy(str, data, remaining_size); str_null_end = str + remaining_size; - int r; - OnigEncodingType *encodings[] = { - ONIG_ENCODING_SJIS, - ONIG_ENCODING_EUC_JP, - ONIG_ENCODING_CP1251, - ONIG_ENCODING_ISO_8859_1, - ONIG_ENCODING_UTF8, - ONIG_ENCODING_KOI8_R, - ONIG_ENCODING_BIG5 + r = exec(enc, options, syntax, + (char *)pattern, (char *)pattern_end, + (char *)str, str_null_end); + + free(pattern); + free(str); + return r; +} + + +#define EXEC_PRINT_INTERVAL 10000000 +#define MAX_PATTERN_SIZE 150 + +#ifdef SYNTAX_TEST +#define NUM_CONTROL_BYTES 3 +#else +#define NUM_CONTROL_BYTES 2 +#endif + +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ +#if !defined(UTF16_BE) && !defined(UTF16_LE) + static OnigEncoding encodings[] = { + ONIG_ENCODING_UTF8, + ONIG_ENCODING_UTF8, + ONIG_ENCODING_UTF8, + ONIG_ENCODING_SJIS, + //ONIG_ENCODING_EUC_JP, + ONIG_ENCODING_ISO_8859_1, + ONIG_ENCODING_BIG5, + ONIG_ENCODING_GB18030, + ONIG_ENCODING_EUC_TW + }; + unsigned char encoding_choice; +#endif + +#ifdef SYNTAX_TEST + static OnigSyntaxType* 
syntaxes[] = { + ONIG_SYNTAX_POSIX_EXTENDED, + ONIG_SYNTAX_EMACS, + ONIG_SYNTAX_GREP, + ONIG_SYNTAX_GNU_REGEX, + ONIG_SYNTAX_JAVA, + ONIG_SYNTAX_PERL_NG, + ONIG_SYNTAX_ONIGURUMA }; + unsigned char syntax_choice; +#endif + + int r; + int pattern_size; + size_t remaining_size; + unsigned char *data; + unsigned char options_choice; + OnigOptionType options; + OnigEncoding enc; + OnigSyntaxType* syntax; - OnigEncodingType *enc; + INPUT_COUNT++; + if (Size < NUM_CONTROL_BYTES) return 0; + + remaining_size = Size; + data = (unsigned char* )(Data); #ifdef UTF16_BE enc = ONIG_ENCODING_UTF16_BE; @@ -138,24 +244,113 @@ int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) #ifdef UTF16_LE enc = ONIG_ENCODING_UTF16_LE; #else + encoding_choice = data[0]; + data++; + remaining_size--; + int num_encodings = sizeof(encodings)/sizeof(encodings[0]); enc = encodings[encoding_choice % num_encodings]; #endif #endif - r = exec(enc, ONIG_OPTION_NONE, (char *)pattern, (char *)pattern_end, - (char *)str, str_null_end); +#ifdef SYNTAX_TEST + syntax_choice = data[0]; + data++; + remaining_size--; - free(pattern); - free(str); + int num_syntaxes = sizeof(syntaxes)/sizeof(syntaxes[0]); + syntax = syntaxes[syntax_choice % num_syntaxes]; +#else + syntax = ONIG_SYNTAX_DEFAULT; +#endif + + options_choice = data[0]; + options = (options_choice % 2 == 0) ? 
ONIG_OPTION_NONE : ONIG_OPTION_IGNORECASE; + data++; + remaining_size--; + +#ifdef WITH_READ_MAIN +#ifdef SYNTAX_TEST + fprintf(stdout, "enc: %s, syntax: %d, options: %u\n", + ONIGENC_NAME(enc), (int )(syntax_choice % num_syntaxes), options); +#else + fprintf(stdout, "enc: %s, options: %u\n", ONIGENC_NAME(enc), options); +#endif +#endif +#ifdef WITH_READ_MAIN + int max_pattern_size; + + if (remaining_size == 0) + max_pattern_size = 0; + else { + max_pattern_size = remaining_size - 1; + if (max_pattern_size > MAX_PATTERN_SIZE) + max_pattern_size = MAX_PATTERN_SIZE; + +#if defined(UTF16_BE) || defined(UTF16_LE) + if (max_pattern_size % 2 == 1) max_pattern_size--; +#endif + } + + for (pattern_size = 0; pattern_size <= max_pattern_size; ) { + fprintf(stdout, "pattern_size: %d\n", pattern_size); + r = alloc_exec(enc, options, syntax, pattern_size, remaining_size, data); + if (r == -2) { + //output_data("parser-bug", Data, Size); + exit(-2); + } + +#if defined(UTF16_BE) || defined(UTF16_LE) + pattern_size += 2; +#else + pattern_size++; +#endif + } + +#else /* WITH_READ_MAIN */ + + if (remaining_size == 0) + pattern_size = 0; + else { + pattern_size = INPUT_COUNT % remaining_size; + if (pattern_size > MAX_PATTERN_SIZE) + pattern_size = MAX_PATTERN_SIZE; + +#if defined(UTF16_BE) || defined(UTF16_LE) + if (pattern_size % 2 == 1) pattern_size--; +#endif + } + + r = alloc_exec(enc, options, syntax, pattern_size, remaining_size, data); + if (r == -2) { + //output_data("parser-bug", Data, Size); + exit(-2); + } +#endif /* else WITH_READ_MAIN */ + + if (EXEC_COUNT_INTERVAL == EXEC_PRINT_INTERVAL) { + char d[64]; + time_t t; + float fexec, freg, fvalid; + + t = time(NULL); + strftime(d, sizeof(d), "%m/%d %H:%M:%S", localtime(&t)); + + fexec = (float )EXEC_COUNT / INPUT_COUNT; + freg = (float )REGEX_SUCCESS_COUNT / INPUT_COUNT; + fvalid = (float )VALID_STRING_COUNT / INPUT_COUNT; + + fprintf(stdout, "%s: %ld: EXEC:%.2f, REG:%.2f, VALID:%.2f\n", + d, EXEC_COUNT, fexec, freg, 
fvalid); + + EXEC_COUNT_INTERVAL = 0; + } return r; } #ifdef WITH_READ_MAIN -#include - extern int main(int argc, char* argv[]) { size_t n; diff --git a/harnesses/libfuzzer-onig.cpp b/harnesses/libfuzzer-onig.cpp new file mode 100644 index 0000000..526c826 --- /dev/null +++ b/harnesses/libfuzzer-onig.cpp @@ -0,0 +1,45 @@ +/* libfuzzer test code for oniguruma + * author: Hanno Böck, license: CC0/public domain + +Usage: +* compile oniguruma with something like + ./configure CC=clang LD=clang CFLAGS="-fsanitize-coverage=edge -fsanitize=address" \ + LDFLAGS="-fsanitize-coverage=edge -fsanitize=address" +* Compile libfuzzer stub and link against static libonig.a and libFuzzer.a: + clang++ libfuzzer-onig.cpp src/.libs/libonig.a libFuzzer.a -o libfuzzer-onig \ + -fsanitize-coverage=edge -fsanitize=address +* Put sample patterns in directory "in/" +* Run + ./libfuzzer-onig in + +Consult libfuzzer docs for further details and how to create libFuzzer.a: +http://llvm.org/docs/LibFuzzer.html + + */ +#include +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ + regex_t *reg; + OnigEncoding enc; + + enc = ONIG_ENCODING_UTF8; + +#ifdef FULL_TEST + onig_initialize(&enc, 1); + onig_set_retry_limit_in_match(120); + onig_set_parse_depth_limit(120); +#endif + + if (onig_new(®, Data, Data + Size, ONIG_OPTION_DEFAULT, enc, + ONIG_SYNTAX_DEFAULT, 0) == 0) + onig_free(reg); + +#ifdef FULL_TEST + onig_end(); +#endif + + return 0; +} diff --git a/harnesses/makefile b/harnesses/makefile new file mode 100644 index 0000000..dfd84de --- /dev/null +++ b/harnesses/makefile @@ -0,0 +1,69 @@ +# makefile for harness +SRC = ../src +CFLAGS = -I$(SRC) -Wall -g -fsanitize=fuzzer,address -fno-omit-frame-pointer +CFLAGS_M = -I$(SRC) -Wall -g -fsanitize=fuzzer-no-link,address -fno-omit-frame-pointer -DWITH_READ_MAIN +ONIG_LIB = $(SRC)/.libs/libonig.a +LIBS = $(ONIG_LIB) + +TARGETS = encode-libfuzzer syntax-libfuzzer \ + utf16-be-libfuzzer utf16-le-libfuzzer 
main-encode main-syntax \ + main-utf16-be main-utf16-le main-regset regset-libfuzzer + +OTHER_TARGETS = libfuzzer-onig libfuzzer-onig-full \ + deluxe-encode-libfuzzer main-deluxe-encode + + +default: $(TARGETS) + +encode-libfuzzer: encode-harness.c $(ONIG_LIB) + clang $(CFLAGS) $< $(LIBS) -o $@ + +syntax-libfuzzer: encode-harness.c $(ONIG_LIB) + clang -DSYNTAX_TEST $(CFLAGS) $< $(LIBS) -o $@ + +deluxe-encode-libfuzzer: deluxe-encode-harness.c $(ONIG_LIB) + clang $(CFLAGS) $< $(LIBS) -o $@ + +utf16-be-libfuzzer: encode-harness.c $(ONIG_LIB) + clang -DUTF16_BE $(CFLAGS) $< $(LIBS) -o $@ + +utf16-le-libfuzzer: encode-harness.c $(ONIG_LIB) + clang -DUTF16_LE $(CFLAGS) $< $(LIBS) -o $@ + +regset-libfuzzer: regset-harness.c $(ONIG_LIB) + clang $(CFLAGS) $< $(LIBS) -o $@ + +main-encode: encode-harness.c $(ONIG_LIB) + clang $(CFLAGS_M) $< $(LIBS) -o $@ + +main-syntax: encode-harness.c $(ONIG_LIB) + clang -DSYNTAX_TEST $(CFLAGS_M) $< $(LIBS) -o $@ + +main-deluxe-encode: deluxe-encode-harness.c $(ONIG_LIB) + clang $(CFLAGS_M) $< $(LIBS) -o $@ + +main-utf16-be: encode-harness.c $(ONIG_LIB) + clang -DUTF16_BE $(CFLAGS_M) $< $(LIBS) -o $@ + +main-utf16-le: encode-harness.c $(ONIG_LIB) + clang -DUTF16_LE $(CFLAGS_M) $< $(LIBS) -o $@ + +main-regset: regset-harness.c $(ONIG_LIB) + clang $(CFLAGS_M) $< $(LIBS) -o $@ + +libfuzzer-onig: libfuzzer-onig.cpp $(ONIG_LIB) + clang++ $(CFLAGS) $< $(LIBS) -o $@ + +libfuzzer-onig-full: libfuzzer-onig.cpp $(ONIG_LIB) + clang++ -DFULL_TEST $(CFLAGS) $< $(LIBS) -o $@ + + +$(ONIG_LIB): + cd ..; make clean + #cd ..; autoreconf -vfi + cd ..; ./configure CC=clang LD=clang CFLAGS="-g -fsanitize=address -fno-omit-frame-pointer" LDFLAGS="-g -fsanitize=address -fno-omit-frame-pointer" + cd ..; make -j4 + + +clean: + rm -f $(TARGETS) $(OTHER_TARGETS) diff --git a/harnesses/regset-harness.c b/harnesses/regset-harness.c new file mode 100644 index 0000000..b4b7e20 --- /dev/null +++ b/harnesses/regset-harness.c @@ -0,0 +1,379 @@ +/* + * regset-harness.c + * 
Copyright (c) 2019 K.Kosako + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "oniguruma.h" + + +#define RETRY_LIMIT 500 + +#ifdef WITH_READ_MAIN +//#define CHECK_EACH_REGEX_SEARCH_TIME +#endif + +#define MAX_REG_NUM 256 + +typedef unsigned char uint8_t; +static OnigEncoding ENC; + +#ifdef CHECK_EACH_REGEX_SEARCH_TIME +static double +get_sec(struct timespec* ts, struct timespec* te) +{ + double t; + + t = (te->tv_sec - ts->tv_sec) + + (double )(te->tv_nsec - ts->tv_nsec) / 1000000000.0; + return t; +} + +static int +check_each_regex_search_time(OnigRegSet* set, unsigned char* str, unsigned char* end) +{ + int n; + int i; + int r; + OnigRegion* region; + + n = onig_regset_number_of_regex(set); + region = onig_region_new(); + + for (i = 0; i < n; i++) { + regex_t* reg; + unsigned char* start; + unsigned char* range; + struct timespec ts1, ts2; + double t; + + reg = onig_regset_get_regex(set, i); + start = str; + range = end; + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1); + + r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2); + t = get_sec(&ts1, &ts2); + + fprintf(stdout, "regex search time %d: %6.2lfmsec.\n", i, t * 1000.0); + } + + onig_region_free(region, 1); + return 0; +} +#endif + +static int +search(OnigRegSet* set, OnigRegSetLead lead, unsigned char* str, unsigned char* end) +{ + int r; + int match_pos; + unsigned char *start, *range; + + start = str; + range = end; + r = onig_regset_search(set, str, end, start, range, lead, + ONIG_OPTION_NONE, &match_pos); + if (r >= 0) { +#ifdef WITH_READ_MAIN + int i; + int match_index; + OnigRegion* region; + + match_index = r; + fprintf(stdout, "match reg index: %d, pos: %d (%s)\n", + match_index, match_pos, ONIGENC_NAME(ENC)); + region = onig_regset_get_region(set, match_index); + if (region == 0) { + fprintf(stdout, "ERROR: can't get region.\n"); + return -1; + } + + for (i = 0; i < 
region->num_regs; i++) { + fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } +#endif + } + else if (r == ONIG_MISMATCH) { +#ifdef WITH_READ_MAIN + fprintf(stdout, "search fail (%s)\n", ONIGENC_NAME(ENC)); +#endif + } + else { /* error */ +#ifdef WITH_READ_MAIN + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + + onig_error_code_to_str((UChar* )s, r); + fprintf(stdout, "ERROR: %s\n", s); + fprintf(stdout, " (%s)\n", ONIGENC_NAME(ENC)); +#endif + return -1; + } + + return 0; +} + +static long INPUT_COUNT; +static long EXEC_COUNT; +static long EXEC_COUNT_INTERVAL; +static long REGEX_SUCCESS_COUNT; +static long VALID_STRING_COUNT; + +static int +exec(OnigEncoding enc, int reg_num, int init_reg_num, + UChar* pat[], UChar* pat_end[], + OnigRegSetLead lead, UChar* str, UChar* end) +{ + int r; + int i, j; + OnigRegSet* set; + regex_t* reg; + OnigOptionType options; + OnigErrorInfo einfo; + regex_t* regs[MAX_REG_NUM]; + + EXEC_COUNT++; + EXEC_COUNT_INTERVAL++; + + options = (EXEC_COUNT % 4 == 0) ? 
ONIG_OPTION_IGNORECASE : ONIG_OPTION_NONE; + + onig_initialize(&enc, 1); + onig_set_retry_limit_in_match(RETRY_LIMIT); + + for (i = 0; i < init_reg_num; i++) { + r = onig_new(®s[i], pat[i], pat_end[i], options, ENC, + ONIG_SYNTAX_DEFAULT, &einfo); + if (r != 0) { +#ifdef WITH_READ_MAIN + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: index: %d, %s\n", i, s); +#endif + + for (j = 0; j < i; j++) onig_free(regs[j]); + + onig_end(); + + if (r == ONIGERR_PARSER_BUG || + r == ONIGERR_STACK_BUG || + r == ONIGERR_UNDEFINED_BYTECODE || + r == ONIGERR_UNEXPECTED_BYTECODE) { + return -2; + } + else + return -1; + } + } + + r = onig_regset_new(&set, init_reg_num, regs); + if (r != 0) { + for (i = 0; i < init_reg_num; i++) { + onig_free(regs[i]); + } + onig_end(); + return -1; + } + + for (i = init_reg_num; i < reg_num; i++) { + r = onig_new(®, pat[i], pat_end[i], options, ENC, + ONIG_SYNTAX_DEFAULT, &einfo); + if (r != 0) { +#ifdef WITH_READ_MAIN + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: index: %d, %s\n", i, s); +#endif + onig_regset_free(set); + onig_end(); + + if (r == ONIGERR_PARSER_BUG || + r == ONIGERR_STACK_BUG || + r == ONIGERR_UNDEFINED_BYTECODE || + r == ONIGERR_UNEXPECTED_BYTECODE) { + return -2; + } + else + return -1; + } + + r = onig_regset_add(set, reg); + if (r != 0) { + onig_regset_free(set); + onig_end(); + fprintf(stdout, "ERROR: onig_regset_add(): %d\n", i); + return r; + } + } + + REGEX_SUCCESS_COUNT++; + + if (onigenc_is_valid_mbc_string(enc, str, end) != 0) { + VALID_STRING_COUNT++; + r = search(set, lead, str, end); +#ifdef CHECK_EACH_REGEX_SEARCH_TIME + r = check_each_regex_search_time(set, str, end); +#endif + } + + onig_regset_free(set); + onig_end(); + return 0; +} + +#define MAX_PATTERN_SIZE 30 +#define NUM_CONTROL_BYTES 3 + +#define EXEC_PRINT_INTERVAL 2000000 + +static int MaxRegNum; +static int MaxInitRegNum; 
+ +extern int +LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ + int r, i; + int pattern_size; + unsigned char *str_null_end; + size_t remaining_size; + unsigned char *data; + unsigned int reg_num; + unsigned int init_reg_num; + unsigned char* pat[256]; + unsigned char* pat_end[256]; + int len; + unsigned int lead_num; + OnigRegSetLead lead; + + INPUT_COUNT++; + + if (Size < NUM_CONTROL_BYTES) return 0; + + remaining_size = Size; + data = (unsigned char* )(Data); + + reg_num = data[0]; + data++; + remaining_size--; + + init_reg_num = data[0]; + data++; + remaining_size--; + + lead_num = data[0]; + data++; + remaining_size--; + lead = (lead_num % 2 == 0 ? ONIG_REGSET_POSITION_LEAD : ONIG_REGSET_REGEX_LEAD); + + if (remaining_size < reg_num * 2) { + reg_num = reg_num % 15; // zero is OK. + } + + init_reg_num %= (reg_num + 1); + + if (MaxRegNum < reg_num) + MaxRegNum = reg_num; + + if (MaxInitRegNum < init_reg_num) + MaxInitRegNum = init_reg_num; + + if (reg_num == 0) + pattern_size = 1; + else + pattern_size = remaining_size / (reg_num * 2); + + if (pattern_size > MAX_PATTERN_SIZE) + pattern_size = MAX_PATTERN_SIZE; + + len = pattern_size * reg_num; + if (len == 0) len = 1; + + for (i = 0; i < reg_num; i++) { + pat[i] = (unsigned char* )malloc(pattern_size); + memcpy(pat[i], data, pattern_size); + pat_end[i] = pat[i] + pattern_size; + data += pattern_size; + remaining_size -= pattern_size; + } + + unsigned char *str = (unsigned char*)malloc(remaining_size != 0 ? remaining_size : 1); + memcpy(str, data, remaining_size); + str_null_end = str + remaining_size; + +#ifdef WITH_READ_MAIN + fprintf(stdout, "reg num: %d, pattern size: %d, lead: %s\n", + reg_num, pattern_size, + lead == ONIG_REGSET_POSITION_LEAD ? 
"position" : "regex"); + + if (reg_num != 0) { + unsigned char* p; + i = 0; + p = pat[0]; + while (p < pat_end[0]) { + fprintf(stdout, " 0x%02x", (int )*p++); + i++; + if (i % 8 == 0) fprintf(stdout, "\n"); + } + fprintf(stdout, "\n"); + } +#endif + + ENC = ONIG_ENCODING_UTF8; + + r = exec(ENC, reg_num, init_reg_num, pat, pat_end, lead, str, str_null_end); + + for (i = 0; i < reg_num; i++) { + free(pat[i]); + } + free(str); + + if (r == -2) { + //output_data("parser-bug", Data, Size); + exit(-2); + } + + if (EXEC_COUNT_INTERVAL == EXEC_PRINT_INTERVAL) { + char d[64]; + time_t t; + float fexec, freg, fvalid; + + t = time(NULL); + strftime(d, sizeof(d), "%m/%d %H:%M:%S", localtime(&t)); + + fexec = (float )EXEC_COUNT / INPUT_COUNT; + freg = (float )REGEX_SUCCESS_COUNT / INPUT_COUNT; + fvalid = (float )VALID_STRING_COUNT / INPUT_COUNT; + + fprintf(stdout, "%s: %ld: EXEC:%.2f, REG:%.2f, VALID:%.2f MAX REG:%d-%d\n", + d, EXEC_COUNT, fexec, freg, fvalid, MaxRegNum, MaxInitRegNum); + + EXEC_COUNT_INTERVAL = 0; + } + return r; +} + +#ifdef WITH_READ_MAIN + +extern int main(int argc, char* argv[]) +{ + size_t n; + uint8_t Data[10000]; + + n = read(0, Data, sizeof(Data)); + fprintf(stdout, "n: %ld\n", n); + LLVMFuzzerTestOneInput(Data, n); + + return 0; +} +#endif /* WITH_READ_MAIN */ diff --git a/harnesses/syntax-harness.c b/harnesses/syntax-harness.c deleted file mode 100644 index 0fb3587..0000000 --- a/harnesses/syntax-harness.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * syntax-harness.c - * contributed by Mark Griffin - */ -#include -#include -#include "oniguruma.h" - -#include - -#define DEFAULT_LIMIT 120 -typedef unsigned char uint8_t; - -extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr) -{ - int r; - unsigned char *start, *range, *end; - regex_t* reg; - OnigErrorInfo einfo; - OnigRegion *region; - UChar* pattern = (UChar* )apattern; - UChar* str = (UChar* )astr; - - r = onig_new(®, pattern, pattern + strlen((char* )pattern), - ONIG_OPTION_DEFAULT, 
ONIG_ENCODING_ASCII, syntax, &einfo); - if (r != ONIG_NORMAL) { - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str((UChar* )s, r, &einfo); - fprintf(stdout, "ERROR: %s\n", s); - return -1; - } - - region = onig_region_new(); - - end = str + strlen((char* )str); - start = str; - range = end; - r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); - if (r >= 0) { - int i; - - fprintf(stdout, "match at %d\n", r); - for (i = 0; i < region->num_regs; i++) { - fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); - } - } - else if (r == ONIG_MISMATCH) { - fprintf(stdout, "search fail\n"); - } - else { /* error */ - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str((UChar* )s, r); - fprintf(stdout, "ERROR: %s\n", s); - onig_region_free(region, 1 /* 1:free self, 0:free contents only */); - onig_free(reg); - return -1; - } - - onig_region_free(region, 1 /* 1:free self, 0:free contents only */); - onig_free(reg); - return 0; -} - -#define PATTERN_SIZE 64 -#define NUM_CONTROL_BYTES 1 -#define MIN_STR_SIZE 1 -int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) -{ - if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) - return 0; - if (Size > 0x1000) - return 0; - size_t remaining_size = Size; - unsigned char *data = (unsigned char *)(Data); - - // pull off one byte to switch syntax choice - unsigned char syntax_choice = data[0]; - data++; - remaining_size--; - - // copy first PATTERN_SIZE bytes off to be the pattern - unsigned char *pattern = (unsigned char *)malloc(PATTERN_SIZE+1); - memset(pattern, 0, PATTERN_SIZE+1); - memcpy(pattern, data, PATTERN_SIZE); - data += PATTERN_SIZE; - remaining_size -= PATTERN_SIZE; - - unsigned char *str = (unsigned char*)malloc(remaining_size+1); - memset(str, 0, remaining_size+1); - memcpy(str, data, remaining_size); - - OnigEncoding use_encs[] = { ONIG_ENCODING_ASCII }; - onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); - - 
onig_set_retry_limit_in_match(DEFAULT_LIMIT); - onig_set_parse_depth_limit(DEFAULT_LIMIT); - - OnigSyntaxType *syntaxes[] = { - ONIG_SYNTAX_POSIX_EXTENDED, - ONIG_SYNTAX_EMACS, - ONIG_SYNTAX_GREP, - ONIG_SYNTAX_GNU_REGEX, - ONIG_SYNTAX_JAVA, - ONIG_SYNTAX_PERL_NG, - ONIG_SYNTAX_RUBY, - ONIG_SYNTAX_ONIGURUMA, - }; - OnigSyntaxType *syntax = syntaxes[syntax_choice % 8]; - - int r; - r = exec(syntax, (char *)pattern, (char *)str); - // r = exec(ONIG_SYNTAX_JAVA, "\\p{XDigit}\\P{XDigit}[a-c&&b-g]", "bgc"); - - onig_end(); - - free(pattern); - free(str); - - return 0; -} diff --git a/sample/Makefile.am b/sample/Makefile.am index 320afcf..22a4989 100644 --- a/sample/Makefile.am +++ b/sample/Makefile.am @@ -6,7 +6,11 @@ LDADD = $(lib_onig) AM_LDFLAGS = -L$(prefix)/lib AM_CPPFLAGS = -I$(top_srcdir)/src -TESTS = encode listcap names posix simple sql syntax user_property callout echo count bug_fix +if ENABLE_POSIX_API +TESTS = encode listcap names posix simple sql syntax user_property callout echo count bug_fix regset +else +TESTS = encode listcap names simple sql syntax user_property callout echo count bug_fix regset +endif check_PROGRAMS = $(TESTS) @@ -22,6 +26,7 @@ callout_SOURCES = callout.c echo_SOURCES = echo.c count_SOURCES = count.c bug_fix = bug_fix.c +regset_SOURCES = regset.c sampledir = . 
@@ -29,7 +34,9 @@ test: $(TESTS) $(sampledir)/encode $(sampledir)/listcap $(sampledir)/names +if ENABLE_POSIX_API $(sampledir)/posix +endif $(sampledir)/simple $(sampledir)/sql $(sampledir)/syntax @@ -38,3 +45,4 @@ test: $(TESTS) $(sampledir)/echo $(sampledir)/count $(sampledir)/bug_fix + $(sampledir)/regset diff --git a/sample/bug_fix.c b/sample/bug_fix.c index 3f60c5b..f295bfd 100644 --- a/sample/bug_fix.c +++ b/sample/bug_fix.c @@ -81,7 +81,7 @@ extern int main(int argc, char* argv[]) /* fix ignore case in look-behind commit: 3340ec2cc5627172665303fe248c9793354d2251 */ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE, - "(?<=\305\211)a", "\312\274na"); /* \u{0149}a \u{02bc}na */ + "\305\211a", "\312\274na"); /* \u{0149}a \u{02bc}na */ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE, "(\\2)(\\1)", "aa"); /* fail. */ diff --git a/sample/regset.c b/sample/regset.c new file mode 100644 index 0000000..ca3a10c --- /dev/null +++ b/sample/regset.c @@ -0,0 +1,94 @@ +/* + * regset.c + */ +#include +#include +#include "oniguruma.h" + +extern int main(int argc, char* argv[]) +{ + int r; + int i, n; + int match_pos; + unsigned char *start, *range, *end; + OnigRegSet* set; + OnigRegSetLead lead; + regex_t* reg; + OnigErrorInfo einfo; + char ebuf[ONIG_MAX_ERROR_MESSAGE_LEN]; + + static UChar* str = (UChar* )"aaaaaaaaaaaaaaaaaaaaaaca"; + + static char* pat[] = { + "a(.*)b|a(.)c", + "^(abc)", + "a(.....)c" + }; + + OnigEncoding use_encs[] = { ONIG_ENCODING_UTF8 }; + onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); + + r = onig_regset_new(&set, 0, NULL); + if (r != ONIG_NORMAL) { + onig_error_code_to_str((UChar* )ebuf, r); + fprintf(stderr, "ERROR: %s\n", ebuf); + onig_end(); + return -1; + } + + n = sizeof(pat) / sizeof(pat[0]); + + for (i = 0; i < n; i++) { + r = onig_new(®, (UChar* )pat[i], (UChar* )(pat[i] + strlen(pat[i])), + ONIG_OPTION_DEFAULT, ONIG_ENCODING_UTF8, ONIG_SYNTAX_DEFAULT, + &einfo); + if (r != ONIG_NORMAL) { + onig_error_code_to_str((UChar* )ebuf, r, 
&einfo); + fprintf(stderr, "ERROR: %s\n", ebuf); + onig_regset_free(set); + onig_end(); + return -1; + } + + r = onig_regset_add(set, reg); + if (r != ONIG_NORMAL) { + onig_free(reg); + onig_regset_free(set); + onig_end(); + return -1; + } + } + + end = str + strlen((char* )str); + start = str; + range = end; + lead = ONIG_REGSET_POSITION_LEAD; + //lead = ONIG_REGSET_PRIORITY_TO_REGEX_ORDER; + r = onig_regset_search(set, str, end, start, range, lead, ONIG_OPTION_NONE, + &match_pos); + if (r >= 0) { + OnigRegion *region; + + fprintf(stderr, "match regex index: %d\n", r); + fprintf(stderr, "match position: %d\n", match_pos); + + region = onig_regset_get_region(set, r); + for (i = 0; i < region->num_regs; i++) { + fprintf(stderr, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + } + else if (r == ONIG_MISMATCH) { + fprintf(stderr, "search fail\n"); + } + else { /* error */ + onig_error_code_to_str((UChar* )ebuf, r); + fprintf(stderr, "ERROR: %s\n", ebuf); + onig_regset_free(set); + onig_end(); + return -1; + } + + onig_regset_free(set); + onig_end(); + return 0; +} diff --git a/src/Makefile.windows b/src/Makefile.windows index 762cf07..1e87504 100644 --- a/src/Makefile.windows +++ b/src/Makefile.windows @@ -2,6 +2,9 @@ product_name = oniguruma +TEST_DIR = $(ONIG_DIR)/../test +WIN_DIR = $(ONIG_DIR)/../windows + CPPFLAGS = CFLAGS = -O2 -nologo /W3 LDFLAGS = @@ -152,25 +155,24 @@ $(BUILD_DIR)/unicode_fold1_key.obj: $(ONIG_DIR)/unicode_fold1_key.c $(ONIG_DIR)/ $(BUILD_DIR)/unicode_fold2_key.obj: $(ONIG_DIR)/unicode_fold2_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h $(BUILD_DIR)/unicode_fold3_key.obj: $(ONIG_DIR)/unicode_fold3_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h -# C library test -ctest: $(testc) - .\$(testc) -# POSIX C library test -ptest: $(testp) - .\$(testp) +test_regset: $(TEST_DIR)/test_regset.c $(libname) + $(CC) -nologo /Fe:$@ /I. 
/I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_regset.c $(libname) + +test_utf8: $(TEST_DIR)/test_utf8.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_utf8.c $(libname) -$(testc): $(testc).c $(libname) - $(CC) -nologo /Fe:$(testc) -DONIG_EXTERN=extern $(testc).c $(libname) +testc: $(WIN_DIR)/testc.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(WIN_DIR)/testc.c $(libname) -$(testp): $(testc).c $(dlllib) - $(CC) -nologo -DPOSIX_TEST /Fe:$(testp) $(testc).c $(dlllib) +testp: $(WIN_DIR)/testc.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /DPOSIX_TEST $(WIN_DIR)/testc.c $(libname) -$(testc)u: $(testc)u.c $(libname) - $(CC) -nologo /Fe:$(testc)u -DONIG_EXTERN=extern $(testc)u.c $(libname) +testu: $(TEST_DIR)/testu.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(TEST_DIR)/testu.c $(libname) clean: - del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\$(testp).exe $(BUILD_DIR)\$(testc).exe $(BUILD_DIR)\$(testc).obj + del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe samples: all diff --git a/src/ascii.c b/src/ascii.c index e83e4d6..f2dc0d3 100644 --- a/src/ascii.c +++ b/src/ascii.c @@ -2,7 +2,7 @@ ascii.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/big5.c b/src/big5.c index ff8bd3b..79ae1e3 100644 --- a/src/big5.c +++ b/src/big5.c @@ -2,7 +2,7 @@ big5.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -54,6 +54,16 @@ big5_mbc_enc_len(const UChar* p) return EncLen_BIG5[*p]; } +static int +big5_code_to_mbclen(OnigCodePoint code) +{ + if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + if ((code & 0xff00) != 0) return 2; + if (EncLen_BIG5[(int )(code & 0xff)] == 1) return 1; + + return ONIGERR_INVALID_CODE_POINT_VALUE; +} + static int is_valid_mbc_string(const UChar* p, const UChar* end) { @@ -99,15 +109,6 @@ big5_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } -#if 0 -static int -big5_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_BIG5, flag, pp, end); -} -#endif - static int big5_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -174,7 +175,7 @@ OnigEncodingType OnigEncodingBIG5 = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, big5_mbc_to_code, - onigenc_mb2_code_to_mbclen, + big5_code_to_mbclen, big5_code_to_mbc, big5_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/config.h.win32 b/src/config.h.win32 index 1f848e2..82a35b9 100644 --- a/src/config.h.win32 +++ b/src/config.h.win32 @@ -1,3 +1,9 @@ +#if defined(__MINGW32__) || _MSC_VER >= 1600 +#define HAVE_STDINT_H 1 +#endif +#if defined(__MINGW32__) || _MSC_VER >= 1800 +#define HAVE_INTTYPES_H 1 +#endif #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 #define HAVE_MEMORY_H 1 diff --git a/src/config.h.win64 b/src/config.h.win64 index f72671b..7f19699 
100644 --- a/src/config.h.win64 +++ b/src/config.h.win64 @@ -1,3 +1,9 @@ +#if defined(__MINGW32__) || _MSC_VER >= 1600 +#define HAVE_STDINT_H 1 +#endif +#if defined(__MINGW32__) || _MSC_VER >= 1800 +#define HAVE_INTTYPES_H 1 +#endif #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 #define HAVE_MEMORY_H 1 diff --git a/src/config.h.windows.in b/src/config.h.windows.in index d8de1dd..d4f73d7 100644 --- a/src/config.h.windows.in +++ b/src/config.h.windows.in @@ -1,7 +1,14 @@ +#if defined(__MINGW32__) || _MSC_VER >= 1600 +#define HAVE_STDINT_H 1 +#endif +#if defined(__MINGW32__) || _MSC_VER >= 1800 +#define HAVE_INTTYPES_H 1 +#endif #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 #define HAVE_MEMORY_H 1 #define HAVE_OFF_T 1 + #define SIZEOF_INT 4 #define SIZEOF_LONG 4 #define SIZEOF_LONG_LONG 8 diff --git a/src/cp1251.c b/src/cp1251.c index b4ce4d8..fa20780 100644 --- a/src/cp1251.c +++ b/src/cp1251.c @@ -2,8 +2,8 @@ cp1251.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2006-2018 Byte - * K.Kosako + * Copyright (c) 2006-2019 Byte + * K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/euc_jp.c b/src/euc_jp.c index d17386d..640b3e3 100644 --- a/src/euc_jp.c +++ b/src/euc_jp.c @@ -2,7 +2,7 @@ euc_jp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -120,25 +120,6 @@ code_to_mbclen(OnigCodePoint code) return ONIGERR_INVALID_CODE_POINT_VALUE; } -#if 0 -static int -code_to_mbc_first(OnigCodePoint code) -{ - int first; - - if ((code & 0xff0000) != 0) { - first = (code >> 16) & 0xff; - } - else if ((code & 0xff00) != 0) { - first = (code >> 8) & 0xff; - } - else { - return (int )code; - } - return first; -} -#endif - static int code_to_mbc(OnigCodePoint code, UChar *buf) { diff --git a/src/euc_jp_prop.c b/src/euc_jp_prop.c index be719cf..a816f48 100644 --- a/src/euc_jp_prop.c +++ b/src/euc_jp_prop.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -pt -T -L ANSI-C -N onigenc_euc_jp_lookup_property_name --output-file gperf1.tmp euc_jp_prop.gperf */ +/* Command-line: gperf -pt -T -L ANSI-C -N onigenc_euc_jp_lookup_property_name --output-file gperf1.tmp euc_jp_prop.gperf */ /* Computed positions: -k'1,3' */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ diff --git a/src/euc_kr.c b/src/euc_kr.c index bb968b0..7fa50af 100644 --- a/src/euc_kr.c +++ b/src/euc_kr.c @@ -2,7 +2,7 @@ euc_kr.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -54,6 +54,16 @@ euckr_mbc_enc_len(const UChar* p) return EncLen_EUCKR[*p]; } +static int +euckr_code_to_mbclen(OnigCodePoint code) +{ + if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + if ((code & 0xff00) != 0) return 2; + if (EncLen_EUCKR[(int )(code & 0xff)] == 1) return 1; + + return ONIGERR_INVALID_CODE_POINT_VALUE; +} + static int is_valid_mbc_string(const UChar* p, const UChar* end) { @@ -98,15 +108,6 @@ euckr_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } -#if 0 -static int -euckr_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_KR, flag, pp, end); -} -#endif - static int euckr_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -149,7 +150,7 @@ OnigEncodingType OnigEncodingEUC_KR = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, euckr_mbc_to_code, - onigenc_mb2_code_to_mbclen, + euckr_code_to_mbclen, euckr_code_to_mbc, euckr_mbc_case_fold, onigenc_ascii_apply_all_case_fold, @@ -174,7 +175,7 @@ OnigEncodingType OnigEncodingEUC_CN = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, euckr_mbc_to_code, - onigenc_mb2_code_to_mbclen, + euckr_code_to_mbclen, euckr_code_to_mbc, euckr_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/euc_tw.c b/src/euc_tw.c index c9acaf1..8e72b97 100644 --- a/src/euc_tw.c +++ b/src/euc_tw.c @@ -2,7 +2,7 @@ euc_tw.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -54,6 +54,20 @@ euctw_mbc_enc_len(const UChar* p) return EncLen_EUCTW[*p]; } +static int +euctw_code_to_mbclen(OnigCodePoint code) +{ + if ((code & 0xff000000) != 0) return 4; + else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + else if ((code & 0xff00) != 0) return 2; + else { + if (EncLen_EUCTW[(int )(code & 0xff)] == 1) + return 1; + + return ONIGERR_INVALID_CODE_POINT_VALUE; + } +} + static int is_valid_mbc_string(const UChar* p, const UChar* end) { @@ -155,7 +169,7 @@ OnigEncodingType OnigEncodingEUC_TW = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, euctw_mbc_to_code, - onigenc_mb4_code_to_mbclen, + euctw_code_to_mbclen, euctw_code_to_mbc, euctw_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/gb18030.c b/src/gb18030.c index 8d415b0..50898eb 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -3,7 +3,7 @@ **********************************************************************/ /*- * Copyright (c) 2005-2019 KUBO Takehiro - * K.Kosako + * K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -33,6 +33,7 @@ #if 1 #define DEBUG_GB18030(arg) #else +#include #define DEBUG_GB18030(arg) printf arg #endif @@ -75,6 +76,20 @@ gb18030_mbc_enc_len(const UChar* p) return 2; } +static int +gb18030_code_to_mbclen(OnigCodePoint code) +{ + if ((code & 0xff000000) != 0) return 4; + else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + else if ((code & 0xff00) != 0) return 2; + else { + if (GB18030_MAP[(int )(code & 0xff)] == CM) + return ONIGERR_INVALID_CODE_POINT_VALUE; + + return 1; + } +} + static int is_valid_mbc_string(const UChar* p, const UChar* end) { @@ -135,15 +150,6 @@ gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } -#if 0 -static int -gb18030_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end); -} -#endif - static int gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -522,7 +528,7 @@ OnigEncodingType OnigEncodingGB18030 = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, gb18030_mbc_to_code, - onigenc_mb4_code_to_mbclen, + gb18030_code_to_mbclen, gb18030_code_to_mbc, gb18030_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/gperf_fold_key_conv.py b/src/gperf_fold_key_conv.py index f453186..c633100 100755 --- a/src/gperf_fold_key_conv.py +++ b/src/gperf_fold_key_conv.py @@ -12,7 +12,7 @@ REG_STR_AT = re.compile('str\[(\d+)\]') REG_RETURN_TYPE = re.compile('^const\s+short\s+int\s*\*') REG_FOLD_KEY = re.compile('unicode_fold(\d)_key\s*\(register\s+const\s+char\s*\*\s*str,\s*register\s+size_t\s+len\)') REG_ENTRY = re.compile('\{".*?",\s*(-?\d+)\s*\}') -REG_IF_LEN = re.compile('if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') +REG_IF_LEN = re.compile('\s*if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') REG_GET_HASH = 
re.compile('(?:register\s+)?(?:unsigned\s+)?int\s+key\s*=\s*hash\s*\(str,\s*len\);') REG_GET_CODE = re.compile('(?:register\s+)?const\s+char\s*\*\s*s\s*=\s*wordlist\[key\]\.name;') REG_CODE_CHECK = re.compile('if\s*\(\*str\s*==\s*\*s\s*&&\s*!strncmp.+\)') @@ -34,7 +34,7 @@ def parse_line(s, key_len): if r != s: return r r = re.sub(REG_ENTRY, '\\1', s) if r != s: return r - r = re.sub(REG_IF_LEN, 'if (0 == 0)', s) + r = re.sub(REG_IF_LEN, '', s) if r != s: return r r = re.sub(REG_GET_HASH, 'int key = hash(codes);', s) if r != s: return r diff --git a/src/gperf_unfold_key_conv.py b/src/gperf_unfold_key_conv.py index 3cf4836..d999d4e 100755 --- a/src/gperf_unfold_key_conv.py +++ b/src/gperf_unfold_key_conv.py @@ -12,7 +12,7 @@ REG_STR_AT = re.compile('str\[(\d+)\]') REG_UNFOLD_KEY = re.compile('onigenc_unicode_unfold_key\s*\(register\s+const\s+char\s*\*\s*str,\s*register\s+size_t\s+len\)') REG_ENTRY = re.compile('\{".+?",\s*/\*(.+?)\*/\s*(-?\d+),\s*(\d)\}') REG_EMPTY_ENTRY = re.compile('\{"",\s*(-?\d+),\s*(\d)\}') -REG_IF_LEN = re.compile('if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') +REG_IF_LEN = re.compile('\s*if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') REG_GET_HASH = re.compile('(?:register\s+)?(?:unsigned\s+)?int\s+key\s*=\s*hash\s*\(str,\s*len\);') REG_GET_CODE = re.compile('(?:register\s+)?const\s+char\s*\*\s*s\s*=\s*wordlist\[key\]\.name;') REG_CODE_CHECK = re.compile('if\s*\(\*str\s*==\s*\*s\s*&&\s*!strncmp.+\)') @@ -32,7 +32,7 @@ def parse_line(s): if r != s: return r r = re.sub(REG_EMPTY_ENTRY, '{0xffffffff, \\1, \\2}', s) if r != s: return r - r = re.sub(REG_IF_LEN, 'if (0 == 0)', s) + r = re.sub(REG_IF_LEN, '', s) if r != s: return r r = re.sub(REG_GET_HASH, 'int key = hash(&code);', s) if r != s: return r diff --git a/src/iso8859_1.c b/src/iso8859_1.c index 3b64942..e681c2a 100644 --- a/src/iso8859_1.c +++ b/src/iso8859_1.c @@ -2,7 +2,7 @@ iso8859_1.c - Oniguruma (regular expression library) **********************************************************************/ 
/*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -216,32 +216,6 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_1_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (*p >= 0xaa && *p <= 0xba) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_10.c b/src/iso8859_10.c index f5882bc..e98cffb 100644 --- a/src/iso8859_10.c +++ b/src/iso8859_10.c @@ -2,7 +2,7 @@ iso8859_10.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,28 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_10_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_11.c b/src/iso8859_11.c index da8fda0..8639ce2 100644 --- a/src/iso8859_11.c +++ b/src/iso8859_11.c @@ -2,7 +2,7 @@ iso8859_11.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/iso8859_13.c b/src/iso8859_13.c index 0cf251c..2bd460f 100644 --- a/src/iso8859_13.c +++ b/src/iso8859_13.c @@ -2,7 +2,7 @@ iso8859_13.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_13_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf, 0xb5 are lower case letter, but can't convert. */ - if (*p == 0xb5) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_14.c b/src/iso8859_14.c index 030e9f5..5030b55 100644 --- a/src/iso8859_14.c +++ b/src/iso8859_14.c @@ -2,7 +2,7 @@ iso8859_14.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,29 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_14_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_15.c b/src/iso8859_15.c index 859d727..f32c3de 100644 --- a/src/iso8859_15.c +++ b/src/iso8859_15.c @@ -2,7 +2,7 @@ iso8859_15.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_15_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf etc.. are lower case letter, but can't convert. */ - if (*p == 0xaa || *p == 0xb5 || *p == 0xba) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_16.c b/src/iso8859_16.c index 2614e56..22a653a 100644 --- a/src/iso8859_16.c +++ b/src/iso8859_16.c @@ -2,7 +2,7 @@ iso8859_16.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,28 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_16_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_2.c b/src/iso8859_2.c index ba030d5..dc3d0a1 100644 --- a/src/iso8859_2.c +++ b/src/iso8859_2.c @@ -2,7 +2,7 @@ iso8859_2.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,28 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_2_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static const OnigPairCaseFoldCodes CaseFoldMap[] = { { 0xa1, 0xb1 }, { 0xa3, 0xb3 }, diff --git a/src/iso8859_3.c b/src/iso8859_3.c index f090d0b..49dc6b2 100644 --- a/src/iso8859_3.c +++ b/src/iso8859_3.c @@ -2,7 +2,7 @@ iso8859_3.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_3_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (*p == 0xb5) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_4.c b/src/iso8859_4.c index 57dc9fe..f3f6ba9 100644 --- a/src/iso8859_4.c +++ b/src/iso8859_4.c @@ -2,7 +2,7 @@ iso8859_4.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,31 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_4_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - if (*p == 0xa2) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_5.c b/src/iso8859_5.c index a090d25..a5f587c 100644 --- a/src/iso8859_5.c +++ b/src/iso8859_5.c @@ -2,7 +2,7 @@ iso8859_5.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -114,19 +114,6 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - (*pp)++; - v = (EncISO_8859_5_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_6.c b/src/iso8859_6.c index 1c16c79..fb72442 100644 --- a/src/iso8859_6.c +++ b/src/iso8859_6.c @@ -2,7 +2,7 @@ iso8859_6.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/iso8859_7.c b/src/iso8859_7.c index 8c88351..018efac 100644 --- a/src/iso8859_7.c +++ b/src/iso8859_7.c @@ -2,7 +2,7 @@ iso8859_7.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -114,26 +114,6 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - (*pp)++; - v = (EncISO_8859_7_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - if (*p == 0xc0 || *p == 0xe0) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_8.c b/src/iso8859_8.c index bd3e94d..92a5eb1 100644 --- a/src/iso8859_8.c +++ b/src/iso8859_8.c @@ -2,7 +2,7 @@ iso8859_8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/iso8859_9.c b/src/iso8859_9.c index 1d291d5..1f9bdea 100644 --- a/src/iso8859_9.c +++ b/src/iso8859_9.c @@ -2,7 +2,7 @@ iso8859_9.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_9_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf etc.. are lower case letter, but can't convert. 
*/ - if (*p >= 0xaa && *p <= 0xba) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/koi8.c b/src/koi8.c index 94c95a0..37023c6 100644 --- a/src/koi8.c +++ b/src/koi8.c @@ -2,7 +2,7 @@ koi8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -115,25 +115,6 @@ koi8_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -koi8_is_mbc_ambiguous(OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end) -{ - const OnigUChar* p = *pp; - - (*pp)++; - if (((flag & ONIGENC_CASE_FOLD_ASCII_CASE) != 0 && - ONIGENC_IS_MBC_ASCII(p)) || - ((flag & ONIGENC_CASE_FOLD_NONASCII_CASE) != 0 && - !ONIGENC_IS_MBC_ASCII(p))) { - int v = (EncKOI8_CtypeTable[*p] & - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - return (v != 0 ? TRUE : FALSE); - } - return FALSE; -} -#endif - static int koi8_is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/koi8_r.c b/src/koi8_r.c index 1284f7f..c77302f 100644 --- a/src/koi8_r.c +++ b/src/koi8_r.c @@ -2,7 +2,7 @@ koi8_r.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -114,19 +114,6 @@ koi8_r_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -koi8_r_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - (*pp)++; - v = (EncKOI8_R_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int koi8_r_is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/make_property.sh b/src/make_property.sh index bc5cf98..cef0a96 100755 --- a/src/make_property.sh +++ b/src/make_property.sh @@ -1,8 +1,9 @@ #!/bin/sh +GPERF=gperf + TMP1=gperf1.tmp TMP2=gperf2.tmp -GPERF=/usr/local/bin/gperf GPERF_OPT='-pt -T -L ANSI-C' diff --git a/src/make_unicode_egcb_data.py b/src/make_unicode_egcb_data.py index 0f63f97..9c71796 100755 --- a/src/make_unicode_egcb_data.py +++ b/src/make_unicode_egcb_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_egcb_data.py -# Copyright (c) 2017-2018 K.Kosako +# Copyright (c) 2017-2019 K.Kosako import sys import re @@ -13,18 +13,19 @@ PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)") PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") -VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt") +VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") -VERSION_INFO = None +VERSION_INFO = [-1, -1, -1] DIC = { } PROPS = [] PropIndex = { } def check_version_info(s): - global VERSION_INFO m = VERSION_REG.match(s) if m is not None: - VERSION_INFO = m.group(1) + VERSION_INFO[0] = int(m.group(1)) + VERSION_INFO[1] = int(m.group(2)) + VERSION_INFO[2] = int(m.group(3)) def print_ranges(ranges): for (start, end) in ranges: @@ -160,7 +161,7 @@ def parse_properties(path): continue if s[0] == '#': - if VERSION_INFO is None: + if VERSION_INFO[0] < 0: check_version_info(s) m = PR_LINE_REG.match(s) @@ -194,7 +195,7 @@ PROPS = sorted(PROPS) print '/* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */' COPYRIGHT = ''' /*- - * Copyright (c) 2017-2018 K.Kosako + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -222,9 +223,11 @@ COPYRIGHT = ''' print COPYRIGHT print '' -if VERSION_INFO is not None: - print "#define GRAPHEME_BREAK_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' +if VERSION_INFO[0] < 0: + raise RuntimeError("Version is not found") + +print "#define GRAPHEME_BREAK_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) +print '' ranges = [] for prop in PROPS: diff --git a/src/make_unicode_fold.sh b/src/make_unicode_fold.sh index 35ce974..1d5cc1e 100755 --- a/src/make_unicode_fold.sh +++ b/src/make_unicode_fold.sh @@ -1,6 +1,6 @@ #!/bin/sh -GPERF=/usr/local/bin/gperf +GPERF=gperf TMP0=gperf0.tmp TMP1=gperf1.tmp diff --git a/src/make_unicode_fold_data.py b/src/make_unicode_fold_data.py index 783988c..55d5b88 100755 --- a/src/make_unicode_fold_data.py +++ b/src/make_unicode_fold_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_fold_data.py -# Copyright (c) 2016-2018 K.Kosako +# Copyright (c) 2016-2019 K.Kosako import sys import re @@ -16,9 +16,9 @@ DataName = 'OnigUnicodeFolds' ENCODING = 'utf-8' LINE_REG = re.compile("([0-9A-F]{1,6}); (.); ([0-9A-F]{1,6})(?: ([0-9A-F]{1,6}))?(?: ([0-9A-F]{1,6}))?;(?:\s*#\s*)(.*)") -VERSION_REG = re.compile("#.*-(\d+\.\d+\.\d+)\.txt") +VERSION_REG = re.compile("#.*-(\d+)\.(\d+)\.(\d+)\.txt") -VERSION_INFO = None +VERSION_INFO = [-1, -1, -1] FOLDS = {} TURKISH_FOLDS = {} @@ -56,18 +56,19 @@ def form3bytes(x): return "\\x%02x\\x%02x\\x%02x" % (x2, x1, x0) def check_version_info(s): - global VERSION_INFO - if VERSION_INFO is None: - m = VERSION_REG.match(s) - if m is not None: - VERSION_INFO = m.group(1) + m = VERSION_REG.match(s) + if m is not None: + VERSION_INFO[0] = int(m.group(1)) + VERSION_INFO[1] = int(m.group(2)) + VERSION_INFO[2] = int(m.group(3)) def parse_line(s): if len(s) == 0: - return False + return False if s[0] == '#': + if VERSION_INFO[0] < 0: 
check_version_info(s) - return False + return False m = LINE_REG.match(s) if m is None: @@ -232,9 +233,11 @@ def output_fold_source(f, out_comment): print >> f, "/* This file was generated by make_unicode_fold_data.py. */" print >> f, '#include "regenc.h"' print >> f, '' - if VERSION_INFO is not None: - print "#define UNICODE_CASEFOLD_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' + if VERSION_INFO[0] < 0: + raise RuntimeError("Version is not found") + + print "#define UNICODE_CASEFOLD_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) + print '' #output_macros(f, DataName) print >> f, '' #output_typedef(f) @@ -246,7 +249,7 @@ HEAD = ''' /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/make_unicode_property.sh b/src/make_unicode_property.sh index 124d76a..51c8951 100755 --- a/src/make_unicode_property.sh +++ b/src/make_unicode_property.sh @@ -1,10 +1,11 @@ #!/bin/sh +GPERF=gperf + NAME=unicode_property_data TMP1=gperf1.tmp TMP2=gperf2.tmp TMP= -GPERF=/usr/local/bin/gperf GPERF_OPT='-T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool' POOL_CAST='s/\(int *\)\(size_t *\)&\(\(struct +unicode_prop_name_pool_t *\* *\) *0\)->unicode_prop_name_pool_str([^,]+)/pool_offset(\1)/g' diff --git a/src/make_unicode_property_data.py b/src/make_unicode_property_data.py index dc3071a..9776628 100755 --- a/src/make_unicode_property_data.py +++ b/src/make_unicode_property_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_property_data.py -# Copyright (c) 2016-2018 K.Kosako +# Copyright (c) 2016-2019 K.Kosako import sys import re @@ -22,9 +22,12 @@ PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)") PVA_LINE_REG = 
re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") -VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt") +UNICODE_VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") +EMOJI_VERSION_REG = re.compile("(?i)#\s*Version:\s*(\d+)\.(\d+)") + +VERSION_INFO = [-1, -1, -1] +EMOJI_VERSION_INFO = [-1, -1] -VERSION_INFO = None DIC = { } KDIC = { } PropIndex = { } @@ -40,13 +43,6 @@ def fix_block_name(name): s = re.sub(r'[- ]+', '_', name) return 'In_' + s -def check_version_info(s): - global VERSION_INFO - m = VERSION_REG.match(s) - if m is not None: - VERSION_INFO = m.group(1) - - def print_ranges(ranges): for (start, end) in ranges: print "0x%06x, 0x%06x" % (start, end) @@ -233,7 +229,8 @@ def parse_unicode_data_file(f): normalize_ranges_in_dic(dic) return dic, assigned -def parse_properties(path, klass, prop_prefix = None): +def parse_properties(path, klass, prop_prefix = None, version_reg = None): + version_match = None with open(path, 'r') as f: dic = { } prop = None @@ -243,9 +240,10 @@ def parse_properties(path, klass, prop_prefix = None): if len(s) == 0: continue - if s[0] == '#': - if VERSION_INFO is None: - check_version_info(s) + if s[0] == '#' and version_reg is not None and version_match is None: + version_match = version_reg.match(s) + if version_match is not None: + continue m = PR_LINE_REG.match(s) if m: @@ -266,7 +264,7 @@ def parse_properties(path, klass, prop_prefix = None): props.append(prop) normalize_ranges_in_dic(dic) - return (dic, props) + return (dic, props, version_match) def parse_property_aliases(path): a = { } @@ -414,11 +412,11 @@ def entry_and_print_prop_and_index(name, index): nname = normalize_prop_name(name) print_prop_and_index(nname, index) -def parse_and_merge_properties(path, klass): - dic, props = parse_properties(path, klass) +def parse_and_merge_properties(path, klass, prop_prefix = None, version_reg = None): + dic, props, ver_m = 
parse_properties(path, klass, prop_prefix, version_reg) merge_dic(DIC, dic) merge_props(PROPS, props) - return dic, props + return dic, props, ver_m ### main ### argv = sys.argv @@ -447,11 +445,21 @@ with open('UnicodeData.txt', 'r') as f: PROPS = DIC.keys() PROPS = list_sub(PROPS, POSIX_LIST) -parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property') -dic, props = parse_and_merge_properties('Scripts.txt', 'Script') +_, _, ver_m = parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property', None, UNICODE_VERSION_REG) +if ver_m is not None: + VERSION_INFO[0] = int(ver_m.group(1)) + VERSION_INFO[1] = int(ver_m.group(2)) + VERSION_INFO[2] = int(ver_m.group(3)) + +dic, props, _ = parse_and_merge_properties('Scripts.txt', 'Script') DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic)) + parse_and_merge_properties('PropList.txt', 'Binary Property') -parse_and_merge_properties('emoji-data.txt', 'Emoji Property') + +_, _, ver_m = parse_and_merge_properties('emoji-data.txt', 'Emoji Property', None, EMOJI_VERSION_REG) +if ver_m is not None: + EMOJI_VERSION_INFO[0] = int(ver_m.group(1)) + EMOJI_VERSION_INFO[1] = int(ver_m.group(2)) PROPS.append('Unknown') KDIC['Unknown'] = 'Script' @@ -464,9 +472,9 @@ dic, BLOCKS = parse_blocks('Blocks.txt') merge_dic(DIC, dic) if INCLUDE_GRAPHEME_CLUSTER_DATA: - dic, props = parse_properties('GraphemeBreakProperty.txt', - 'GraphemeBreak Property', - GRAPHEME_CLUSTER_BREAK_NAME_PREFIX) + dic, props, _ = parse_properties('GraphemeBreakProperty.txt', + 'GraphemeBreak Property', + GRAPHEME_CLUSTER_BREAK_NAME_PREFIX) merge_dic(DIC, dic) merge_props(PROPS, props) #prop = GRAPHEME_CLUSTER_BREAK_NAME_PREFIX + 'Other' @@ -533,9 +541,13 @@ sys.stdout.write(s) if OUTPUT_LIST_MODE: UPF = open("UNICODE_PROPERTIES", "w") - if VERSION_INFO is not None: - print >> UPF, "Unicode Properties (from Unicode Version: %s)" % VERSION_INFO - print >> UPF, '' + if VERSION_INFO[0] < 0: + raise RuntimeError("Unicode Version is not 
found") + if EMOJI_VERSION_INFO[0] < 0: + raise RuntimeError("Emoji Version is not found") + + print >> UPF, "Unicode Properties (Unicode Version: %d.%d.%d, Emoji: %d.%d)" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2], EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]) + print >> UPF, '' index = -1 for prop in POSIX_LIST: @@ -569,9 +581,14 @@ if not(POSIX_ONLY): print '%%' print '' if not(POSIX_ONLY): - if VERSION_INFO is not None: - print "#define UNICODE_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' + if VERSION_INFO[0] < 0: + raise RuntimeError("Unicode Version is not found") + if EMOJI_VERSION_INFO[0] < 0: + raise RuntimeError("Emoji Version is not found") + + print "#define UNICODE_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) + print "#define UNICODE_EMOJI_VERSION %02d%02d" % (EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]) + print '' print "#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10) print "#define CODE_RANGES_NUM %d" % (index + 1) diff --git a/src/make_unicode_wb_data.py b/src/make_unicode_wb_data.py index 624fa7e..ddedd5d 100755 --- a/src/make_unicode_wb_data.py +++ b/src/make_unicode_wb_data.py @@ -13,18 +13,19 @@ PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)") PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") -VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt") +VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") -VERSION_INFO = None +VERSION_INFO = [-1, -1, -1] DIC = { } PROPS = [] PropIndex = { } def check_version_info(s): - global VERSION_INFO m = VERSION_REG.match(s) if m is not None: - VERSION_INFO = m.group(1) + VERSION_INFO[0] = int(m.group(1)) + VERSION_INFO[1] = int(m.group(2)) + VERSION_INFO[2] = int(m.group(3)) def print_ranges(ranges): for (start, end) in ranges: @@ -160,7 
+161,7 @@ def parse_properties(path): continue if s[0] == '#': - if VERSION_INFO is None: + if VERSION_INFO[0] < 0: check_version_info(s) m = PR_LINE_REG.match(s) @@ -194,7 +195,7 @@ PROPS = sorted(PROPS) print '/* unicode_wb_data.c: Generated by make_unicode_wb_data.py. */' COPYRIGHT = ''' /*- - * Copyright (c) 2019 K.Kosako + * Copyright (c) 2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -222,9 +223,11 @@ COPYRIGHT = ''' print COPYRIGHT print '' -if VERSION_INFO is not None: - print "#define WORD_BREAK_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' +if VERSION_INFO[0] < 0: + raise RuntimeError("Version is not found.") + +print "#define WORD_BREAK_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) +print '' ranges = [] for prop in PROPS: diff --git a/src/mktable.c b/src/mktable.c index 80ac08a..318bac0 100644 --- a/src/mktable.c +++ b/src/mktable.c @@ -2,7 +2,7 @@ mktable.c **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/onig_init.c b/src/onig_init.c index 7ad98b7..c660e7d 100644 --- a/src/onig_init.c +++ b/src/onig_init.c @@ -2,7 +2,7 @@ onig_init.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2016-2018 K.Kosako + * Copyright (c) 2016-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/oniggnu.h b/src/oniggnu.h index d688883..96d9085 100644 --- a/src/oniggnu.h +++ b/src/oniggnu.h @@ -4,7 +4,7 @@ oniggnu.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/onigposix.h b/src/onigposix.h index da0f919..5ff779f 100644 --- a/src/onigposix.h +++ b/src/onigposix.h @@ -4,7 +4,7 @@ onigposix.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -95,6 +95,7 @@ typedef struct { #endif #endif +#ifndef ONIG_STATIC #ifndef ONIG_EXTERN #if defined(_WIN32) && !defined(__GNUC__) #if defined(ONIGURUMA_EXPORT) @@ -108,6 +109,9 @@ typedef struct { #ifndef ONIG_EXTERN #define ONIG_EXTERN extern #endif +#else +#define ONIG_EXTERN extern +#endif #ifndef ONIGURUMA_H typedef unsigned int OnigOptionType; diff --git a/src/oniguruma.h b/src/oniguruma.h index 90cf2d9..08ac6f7 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -4,7 +4,7 @@ oniguruma.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -36,9 +36,9 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 6 #define ONIGURUMA_VERSION_MINOR 9 -#define ONIGURUMA_VERSION_TEENY 3 +#define ONIGURUMA_VERSION_TEENY 4 -#define ONIGURUMA_VERSION_INT 60903 +#define ONIGURUMA_VERSION_INT 60904 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -687,6 +687,14 @@ typedef OnigRegexType* OnigRegex; typedef OnigRegexType regex_t; #endif +struct OnigRegSetStruct; +typedef struct OnigRegSetStruct OnigRegSet; + +typedef enum { + ONIG_REGSET_POSITION_LEAD = 0, + ONIG_REGSET_REGEX_LEAD = 1, + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER = 2 +} OnigRegSetLead; typedef struct { int num_of_elements; @@ -797,6 +805,26 @@ ONIG_EXTERN int onig_match P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN int onig_match_with_param P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp)); + +ONIG_EXTERN +int onig_regset_new P_((OnigRegSet** rset, int n, regex_t* regs[])); +ONIG_EXTERN +int onig_regset_add P_((OnigRegSet* set, regex_t* reg)); +ONIG_EXTERN +int onig_regset_replace P_((OnigRegSet* set, int at, regex_t* reg)); +ONIG_EXTERN +void onig_regset_free P_((OnigRegSet* set)); +ONIG_EXTERN +int onig_regset_number_of_regex P_((OnigRegSet* set)); +ONIG_EXTERN +regex_t* onig_regset_get_regex P_((OnigRegSet* set, int at)); +ONIG_EXTERN +OnigRegion* onig_regset_get_region P_((OnigRegSet* set, int at)); +ONIG_EXTERN +int onig_regset_search P_((OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, int* rmatch_pos)); +ONIG_EXTERN +int onig_regset_search_with_param P_((OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead 
lead, OnigOptionType option, OnigMatchParam* mps[], int* rmatch_pos)); + ONIG_EXTERN OnigRegion* onig_region_new P_((void)); ONIG_EXTERN diff --git a/src/regcomp.c b/src/regcomp.c index b96c793..69d4b95 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -2,7 +2,7 @@ regcomp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -224,17 +224,17 @@ ops_free(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: if (! is_in_string_pool(reg, op->exact_len_n.s)) xfree(op->exact_len_n.s); break; - case OP_EXACTN: case OP_EXACTMB2N: case OP_EXACTMB3N: case OP_EXACTN_IC: + case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N: case OP_STR_N_IC: if (! is_in_string_pool(reg, op->exact_n.s)) xfree(op->exact_n.s); break; - case OP_EXACT1: case OP_EXACT2: case OP_EXACT3: case OP_EXACT4: - case OP_EXACT5: case OP_EXACTMB2N1: case OP_EXACTMB2N2: - case OP_EXACTMB2N3: case OP_EXACT1_IC: + case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4: + case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2: + case OP_STR_MB2N3: case OP_STR_1_IC: break; case OP_CCLASS_NOT: case OP_CCLASS: @@ -298,17 +298,17 @@ ops_calc_size_of_string_pool(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: total += op->exact_len_n.len * op->exact_len_n.n; break; - case OP_EXACTN: - case OP_EXACTN_IC: + case OP_STR_N: + case OP_STR_N_IC: total += op->exact_n.n; break; - case OP_EXACTMB2N: + case OP_STR_MB2N: total += op->exact_n.n * 2; break; - case OP_EXACTMB3N: + case OP_STR_MB3N: total += op->exact_n.n * 3; break; @@ -349,15 +349,15 @@ ops_make_string_pool(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: len = op->exact_len_n.len * op->exact_len_n.n; xmemcpy(curr, op->exact_len_n.s, len); 
xfree(op->exact_len_n.s); op->exact_len_n.s = curr; curr += len; break; - case OP_EXACTN: - case OP_EXACTN_IC: + case OP_STR_N: + case OP_STR_N_IC: len = op->exact_n.n; copy: xmemcpy(curr, op->exact_n.s, len); @@ -365,11 +365,11 @@ ops_make_string_pool(regex_t* reg) op->exact_n.s = curr; curr += len; break; - case OP_EXACTMB2N: + case OP_STR_MB2N: len = op->exact_n.n * 2; goto copy; break; - case OP_EXACTMB3N: + case OP_STR_MB3N: len = op->exact_n.n * 3; goto copy; break; @@ -427,7 +427,7 @@ onig_positive_int_multiply(int x, int y) static void -swap_node(Node* a, Node* b) +node_swap(Node* a, Node* b) { Node c; @@ -452,6 +452,81 @@ swap_node(Node* a, Node* b) } } +static int +node_list_len(Node* list) +{ + int len; + + len = 1; + while (IS_NOT_NULL(NODE_CDR(list))) { + list = NODE_CDR(list); + len++; + } + + return len; +} + +static Node* +node_list_add(Node* list, Node* x) +{ + Node *n; + + n = onig_node_new_list(x, NULL); + if (IS_NULL(n)) return NULL_NODE; + + if (IS_NOT_NULL(list)) { + while (IS_NOT_NULL(NODE_CDR(list))) + list = NODE_CDR(list); + + NODE_CDR(list) = n; + } + + return n; +} + +static int +node_str_node_cat(Node* node, Node* add) +{ + int r; + + if (STR_(node)->flag != STR_(add)->flag) + return ONIGERR_TYPE_BUG; + + r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end); + if (r != 0) return r; + + if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) + STR_(node)->case_min_len += STR_(add)->case_min_len; + + return 0; +} + +static int +node_str_cat_case_fold(Node* node, const UChar* s, const UChar* end, int case_min_len) +{ + int r; + + if (! 
NODE_STRING_IS_CASE_FOLD_MATCH(node)) + return ONIGERR_TYPE_BUG; + + r = onig_node_str_cat(node, s, end); + if (r != 0) return r; + + STR_(node)->case_min_len += case_min_len; + return 0; +} + +static void +node_conv_to_str_node(Node* node, int flag) +{ + NODE_SET_TYPE(node, NODE_STRING); + STR_(node)->flag = flag; + STR_(node)->s = STR_(node)->buf; + STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; +} + static OnigLen distance_add(OnigLen d1, OnigLen d2) { @@ -549,52 +624,45 @@ static int compile_length_tree(Node* node, regex_t* reg); static int compile_tree(Node* node, regex_t* reg, ScanEnv* env); -#define IS_NEED_STR_LEN_OP_EXACT(op) \ - ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\ - (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC) +#define IS_NEED_STR_LEN_OP(op) \ + ((op) == OP_STR_N || (op) == OP_STR_MB2N ||\ + (op) == OP_STR_MB3N || (op) == OP_STR_MBN || (op) == OP_STR_N_IC) static int -select_str_opcode(int mb_len, int str_len, int ignore_case) +select_str_opcode(int mb_len, int str_len) { int op; - if (ignore_case) { + switch (mb_len) { + case 1: switch (str_len) { - case 1: op = OP_EXACT1_IC; break; - default: op = OP_EXACTN_IC; break; + case 1: op = OP_STR_1; break; + case 2: op = OP_STR_2; break; + case 3: op = OP_STR_3; break; + case 4: op = OP_STR_4; break; + case 5: op = OP_STR_5; break; + default: op = OP_STR_N; break; } - } - else { - switch (mb_len) { - case 1: - switch (str_len) { - case 1: op = OP_EXACT1; break; - case 2: op = OP_EXACT2; break; - case 3: op = OP_EXACT3; break; - case 4: op = OP_EXACT4; break; - case 5: op = OP_EXACT5; break; - default: op = OP_EXACTN; break; - } - break; + break; - case 2: - switch (str_len) { - case 1: op = OP_EXACTMB2N1; break; - case 2: op = OP_EXACTMB2N2; break; - case 3: op = OP_EXACTMB2N3; break; - default: op = OP_EXACTMB2N; break; - } - break; + case 2: + switch (str_len) { + case 1: op = OP_STR_MB2N1; break; + case 2: op = OP_STR_MB2N2; 
break; + case 3: op = OP_STR_MB2N3; break; + default: op = OP_STR_MB2N; break; + } + break; - case 3: - op = OP_EXACTMB3N; - break; + case 3: + op = OP_STR_MB3N; + break; - default: - op = OP_EXACTMBN; - break; - } + default: + op = OP_STR_MBN; + break; } + return op; } @@ -621,31 +689,43 @@ is_strict_real_node(Node* node) } static int -compile_tree_empty_check(Node* node, regex_t* reg, int emptiness, ScanEnv* env) +compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) { int r; - int saved_num_null_check = reg->num_null_check; + int saved_num_empty_check; + int emptiness; + Node* body; + + body = NODE_BODY((Node* )qn); + emptiness = qn->emptiness; + saved_num_empty_check = reg->num_empty_check; if (emptiness != BODY_IS_NOT_EMPTY) { r = add_op(reg, OP_EMPTY_CHECK_START); if (r != 0) return r; - COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */ - reg->num_null_check++; + COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */ + reg->num_empty_check++; } - r = compile_tree(node, reg, env); + r = compile_tree(body, reg, env); if (r != 0) return r; if (emptiness != BODY_IS_NOT_EMPTY) { if (emptiness == BODY_IS_EMPTY_POSSIBILITY) r = add_op(reg, OP_EMPTY_CHECK_END); - else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) - r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) { + if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0) + r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); + else + r = add_op(reg, OP_EMPTY_CHECK_END); + } +#ifdef USE_CALL else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); +#endif if (r != 0) return r; - COP(reg)->empty_check_end.mem = saved_num_null_check; /* NULL CHECK ID */ + COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */ } return r; } @@ -682,14 +762,13 @@ compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env) static int add_compile_string_length(UChar* s 
ARG_UNUSED, int mb_len, int str_len, - regex_t* reg ARG_UNUSED, int ignore_case) + regex_t* reg ARG_UNUSED) { return 1; } static int -add_compile_string(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) +add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg) { int op; int r; @@ -697,14 +776,14 @@ add_compile_string(UChar* s, int mb_len, int str_len, UChar* p; UChar* end; - op = select_str_opcode(mb_len, str_len, ignore_case); + op = select_str_opcode(mb_len, str_len); r = add_op(reg, op); if (r != 0) return r; byte_len = mb_len * str_len; end = s + byte_len; - if (op == OP_EXACTMBN) { + if (op == OP_STR_MBN) { p = onigenc_strdup(reg->enc, s, end); CHECK_NULL_RETURN_MEMERR(p); @@ -712,11 +791,11 @@ add_compile_string(UChar* s, int mb_len, int str_len, COP(reg)->exact_len_n.n = str_len; COP(reg)->exact_len_n.s = p; } - else if (IS_NEED_STR_LEN_OP_EXACT(op)) { + else if (IS_NEED_STR_LEN_OP(op)) { p = onigenc_strdup(reg->enc, s, end); CHECK_NULL_RETURN_MEMERR(p); - if (op == OP_EXACTN_IC) + if (op == OP_STR_N_IC) COP(reg)->exact_n.n = byte_len; else COP(reg)->exact_n.n = str_len; @@ -724,8 +803,8 @@ add_compile_string(UChar* s, int mb_len, int str_len, COP(reg)->exact_n.s = p; } else { + xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s)); xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len); - COP(reg)->exact.s[byte_len] = '\0'; } return 0; @@ -734,7 +813,7 @@ add_compile_string(UChar* s, int mb_len, int str_len, static int compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig; + int rlen, r, len, prev_len, slen; UChar *p, *prev; StrNode* sn; OnigEncoding enc = reg->enc; @@ -743,7 +822,7 @@ compile_length_string_node(Node* node, regex_t* reg) if (sn->end <= sn->s) return 0; - ambig = NODE_STRING_IS_AMBIG(node); + if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) return 1; p = prev = sn->s; prev_len = enclen(enc, p); @@ -757,7 +836,7 @@ compile_length_string_node(Node* node, regex_t* reg) slen++; } 
else { - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, slen, reg); rlen += r; prev = p; slen = 1; @@ -766,25 +845,59 @@ compile_length_string_node(Node* node, regex_t* reg) p += len; } - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, slen, reg); rlen += r; return rlen; } static int -compile_length_string_raw_node(StrNode* sn, regex_t* reg) +compile_length_string_crude_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s), - reg, 0); + reg); +} + +static int +compile_ambig_string_node(Node* node, regex_t* reg) +{ + int r; + int len; + int byte_len; + UChar* p; + StrNode* sn; + OnigEncoding enc = reg->enc; + + sn = STR_(node); + len = enclen(enc, sn->s); + byte_len = (int )(sn->end - sn->s); + if (len == byte_len) { + r = add_op(reg, OP_STR_1_IC); + if (r != 0) return r; + + xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s)); + xmemcpy(COP(reg)->exact.s, sn->s, (size_t )byte_len); + } + else { + r = add_op(reg, OP_STR_N_IC); + if (r != 0) return r; + + p = onigenc_strdup(enc, sn->s, sn->end); + CHECK_NULL_RETURN_MEMERR(p); + + COP(reg)->exact_n.s = p; + COP(reg)->exact_n.n = byte_len; + } + + return 0; } static int compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig; + int r, len, prev_len, slen; UChar *p, *prev, *end; StrNode* sn; OnigEncoding enc = reg->enc; @@ -794,7 +907,9 @@ compile_string_node(Node* node, regex_t* reg) return 0; end = sn->end; - ambig = NODE_STRING_IS_AMBIG(node); + if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) { + return compile_ambig_string_node(node, reg); + } p = prev = sn->s; prev_len = enclen(enc, p); @@ -807,7 +922,7 @@ compile_string_node(Node* node, regex_t* reg) slen++; } else { - r = add_compile_string(prev, prev_len, slen, reg, ambig); + r = add_compile_string(prev, prev_len, 
slen, reg); if (r != 0) return r; prev = p; @@ -818,16 +933,16 @@ compile_string_node(Node* node, regex_t* reg) p += len; } - return add_compile_string(prev, prev_len, slen, reg, ambig); + return add_compile_string(prev, prev_len, slen, reg); } static int -compile_string_raw_node(StrNode* sn, regex_t* reg) +compile_string_crude_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; - return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg, 0); + return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg); } static void* @@ -891,15 +1006,27 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) return 0; } +static void +set_addr_in_repeat_range(regex_t* reg) +{ + int i; + + for (i = 0; i < reg->num_repeat; i++) { + RepeatRange* p = reg->repeat_range + i; + int offset = p->u.offset; + p->u.pcode = reg->ops + offset; + } +} + static int -entry_repeat_range(regex_t* reg, int id, int lower, int upper) +entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index) { #define REPEAT_RANGE_ALLOC 4 - OnigRepeatRange* p; + RepeatRange* p; if (reg->repeat_range_alloc == 0) { - p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC); + p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = REPEAT_RANGE_ALLOC; @@ -907,7 +1034,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) else if (reg->repeat_range_alloc <= id) { int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; - p = (OnigRepeatRange* )xrealloc(reg->repeat_range, sizeof(OnigRepeatRange) * n); + p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = n; @@ -916,8 +1043,9 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) p = reg->repeat_range; } - p[id].lower = lower; - p[id].upper = (IS_INFINITE_REPEAT(upper) ? 
0x7fffffff : upper); + p[id].lower = lower; + p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper); + p[id].u.offset = ops_index; return 0; } @@ -932,24 +1060,16 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, if (r != 0) return r; COP(reg)->repeat.id = num_repeat; - COP(reg)->repeat.addr = SIZE_INC_OP + target_len + SIZE_OP_REPEAT_INC; + COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC; - r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper); + r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper, + COP_CURR_OFFSET(reg) + OPSIZE_REPEAT); if (r != 0) return r; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - if ( -#ifdef USE_CALL - NODE_IS_IN_MULTI_ENTRY(qn) || -#endif - NODE_IS_IN_REAL_REPEAT(qn)) { - r = add_op(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); - } - else { - r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); - } + r = add_op(reg, qn->greedy ? 
OP_REPEAT_INC : OP_REPEAT_INC_NG); if (r != 0) return r; COP(reg)->repeat_inc.id = num_repeat; @@ -985,21 +1105,21 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) if (qn->lower <= 1 || int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) { if (IS_NOT_NULL(qn->next_head_exact)) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + return OPSIZE_ANYCHAR_STAR + tlen * qn->lower; } } mod_tlen = tlen; if (emptiness != BODY_IS_NOT_EMPTY) - mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END; + mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { - len = SIZE_OP_JUMP; + len = OPSIZE_JUMP; } else { len = tlen * qn->lower; @@ -1008,36 +1128,36 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) - len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH_OR_JUMP_EXACT1 + mod_tlen + OPSIZE_JUMP; else #endif if (IS_NOT_NULL(qn->next_head_exact)) - len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH_IF_PEEK_NEXT + mod_tlen + OPSIZE_JUMP; else - len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH + mod_tlen + OPSIZE_JUMP; } else - len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH; + len += OPSIZE_JUMP + mod_tlen + OPSIZE_PUSH; } else if (qn->upper == 0) { - if (qn->is_refered != 0) { /* /(?..){0}/ */ - len = SIZE_OP_JUMP + tlen; + if (qn->include_referred != 0) { /* /(?..){0}/ */ + len = OPSIZE_JUMP + tlen; } else len = 0; } else if (!infinite && qn->greedy && (qn->upper == 1 || - int_multiply_cmp(tlen + SIZE_OP_PUSH, qn->upper, + int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper, 
QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { len = tlen * qn->lower; - len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); + len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower); } else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ - len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen; + len = OPSIZE_PUSH + OPSIZE_JUMP + tlen; } else { - len = SIZE_OP_REPEAT_INC + mod_tlen + SIZE_OP_REPEAT; + len = OPSIZE_REPEAT_INC + mod_tlen + OPSIZE_REPEAT; } return len; @@ -1078,7 +1198,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) mod_tlen = tlen; if (emptiness != BODY_IS_NOT_EMPTY) - mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END; + mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || @@ -1091,16 +1211,16 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) - COP(reg)->jump.addr = SIZE_OP_PUSH_OR_JUMP_EXACT1 + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH_OR_JUMP_EXACT1 + SIZE_INC; else #endif if (IS_NOT_NULL(qn->next_head_exact)) - COP(reg)->jump.addr = SIZE_OP_PUSH_IF_PEEK_NEXT + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH_IF_PEEK_NEXT + SIZE_INC; else - COP(reg)->jump.addr = SIZE_OP_PUSH + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH + SIZE_INC; } else { - COP(reg)->jump.addr = SIZE_OP_JUMP + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_JUMP + SIZE_INC; } } else { @@ -1113,36 +1233,36 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (IS_NOT_NULL(qn->head_exact)) { r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1); if (r != 0) return r; - COP(reg)->push_or_jump_exact1.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); + r = 
compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1); + addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1); } else #endif if (IS_NOT_NULL(qn->next_head_exact)) { r = add_op(reg, OP_PUSH_IF_PEEK_NEXT); if (r != 0) return r; - COP(reg)->push_if_peek_next.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH_IF_PEEK_NEXT); + addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT); } else { r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH); + addr = -(mod_tlen + (int )OPSIZE_PUSH); } r = add_op(reg, OP_JUMP); @@ -1152,9 +1272,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) else { r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = mod_tlen + SIZE_INC_OP; + COP(reg)->jump.addr = mod_tlen + SIZE_INC; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; r = add_op(reg, OP_PUSH); @@ -1163,10 +1283,10 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } } else if (qn->upper == 0) { - if (qn->is_refered != 0) { /* /(?..){0}/ */ + if (qn->include_referred != 0) { /* /(?..){0}/ */ r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = tlen + SIZE_INC_OP; + COP(reg)->jump.addr = tlen + SIZE_INC; r = 
compile_tree(NODE_QUANT_BODY(qn), reg, env); } @@ -1177,7 +1297,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } else if (! infinite && qn->greedy && (qn->upper == 1 || - int_multiply_cmp(tlen + SIZE_OP_PUSH, qn->upper, + int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { int n = qn->upper - qn->lower; @@ -1185,7 +1305,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; for (i = 0; i < n; i++) { - int v = onig_positive_int_multiply(n - i, tlen + SIZE_OP_PUSH); + int v = onig_positive_int_multiply(n - i, tlen + OPSIZE_PUSH); if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; r = add_op(reg, OP_PUSH); @@ -1199,11 +1319,11 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP; r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = tlen + SIZE_INC_OP; + COP(reg)->jump.addr = tlen + SIZE_INC; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); } @@ -1260,35 +1380,35 @@ compile_length_bag_node(BagNode* node, regex_t* reg) #ifdef USE_CALL if (node->m.regnum == 0 && NODE_IS_CALLED(node)) { - len = tlen + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; + len = tlen + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN; return len; } if (NODE_IS_CALLED(node)) { - len = SIZE_OP_MEMORY_START_PUSH + tlen - + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + len = OPSIZE_MEM_START_PUSH + tlen + + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN; + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH); else len += (NODE_IS_RECURSION(node) - ? 
SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END); } else if (NODE_IS_RECURSION(node)) { - len = SIZE_OP_MEMORY_START_PUSH; - len += tlen + (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC); + len = OPSIZE_MEM_START_PUSH; + len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum) + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_REC); } else #endif { - if (MEM_STATUS_AT0(reg->bt_mem_start, node->m.regnum)) - len = SIZE_OP_MEMORY_START_PUSH; + if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum)) + len = OPSIZE_MEM_START_PUSH; else - len = SIZE_OP_MEMORY_START; + len = OPSIZE_MEM_START; - len += tlen + (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum) - ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END); + len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum) + ? OPSIZE_MEM_END_PUSH : OPSIZE_MEM_END); } break; @@ -1303,10 +1423,10 @@ compile_length_bag_node(BagNode* node, regex_t* reg) v = onig_positive_int_multiply(qn->lower, tlen); if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; - len = v + SIZE_OP_PUSH + tlen + SIZE_OP_POP_OUT + SIZE_OP_JUMP; + len = v + OPSIZE_PUSH + tlen + OPSIZE_POP_OUT + OPSIZE_JUMP; } else { - len = SIZE_OP_ATOMIC_START + tlen + SIZE_OP_ATOMIC_END; + len = OPSIZE_ATOMIC_START + tlen + OPSIZE_ATOMIC_END; } break; @@ -1318,8 +1438,8 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len = compile_length_tree(cond, reg); if (len < 0) return len; - len += SIZE_OP_PUSH; - len += SIZE_OP_ATOMIC_START + SIZE_OP_ATOMIC_END; + len += OPSIZE_PUSH; + len += OPSIZE_ATOMIC_START + OPSIZE_ATOMIC_END; if (IS_NOT_NULL(Then)) { tlen = compile_length_tree(Then, reg); @@ -1327,7 +1447,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len += tlen; } - len += SIZE_OP_JUMP + SIZE_OP_ATOMIC_END; + len += OPSIZE_JUMP + OPSIZE_ATOMIC_END; if (IS_NOT_NULL(Else)) { tlen = compile_length_tree(Else, reg); @@ -1352,24 +1472,25 @@ static int 
compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) { int r; - int len; #ifdef USE_CALL if (NODE_IS_CALLED(node)) { + int len; + r = add_op(reg, OP_CALL); if (r != 0) return r; - node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + SIZE_OP_JUMP; + node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP; NODE_STATUS_ADD(node, ADDR_FIXED); COP(reg)->call.addr = (int )node->m.called_addr; if (node->m.regnum == 0) { len = compile_length_tree(NODE_BAG_BODY(node), reg); - len += SIZE_OP_RETURN; + len += OPSIZE_RETURN; r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = len + SIZE_INC_OP; + COP(reg)->jump.addr = len + SIZE_INC; r = compile_tree(NODE_BAG_BODY(node), reg, env); if (r != 0) return r; @@ -1379,25 +1500,24 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) } else { len = compile_length_tree(NODE_BAG_BODY(node), reg); - len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN); - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN); + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH); else - len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + len += (NODE_IS_RECURSION(node) ? 
OPSIZE_MEM_END_REC : OPSIZE_MEM_END); r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = len + SIZE_INC_OP; + COP(reg)->jump.addr = len + SIZE_INC; } } #endif - if (MEM_STATUS_AT0(reg->bt_mem_start, node->m.regnum)) - r = add_op(reg, OP_MEMORY_START_PUSH); + if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum)) + r = add_op(reg, OP_MEM_START_PUSH); else - r = add_op(reg, OP_MEMORY_START); + r = add_op(reg, OP_MEM_START); if (r != 0) return r; COP(reg)->memory_start.num = node->m.regnum; @@ -1405,11 +1525,11 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) if (r != 0) return r; #ifdef USE_CALL - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) r = add_op(reg, (NODE_IS_RECURSION(node) - ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH)); + ? OP_MEM_END_PUSH_REC : OP_MEM_END_PUSH)); else - r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEMORY_END_REC : OP_MEMORY_END)); + r = add_op(reg, (NODE_IS_RECURSION(node) ? 
OP_MEM_END_REC : OP_MEM_END)); if (r != 0) return r; COP(reg)->memory_end.num = node->m.regnum; @@ -1418,10 +1538,10 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_RETURN); } #else - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) - r = add_op(reg, OP_MEMORY_END_PUSH); + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) + r = add_op(reg, OP_MEM_END_PUSH); else - r = add_op(reg, OP_MEMORY_END); + r = add_op(reg, OP_MEM_END); if (r != 0) return r; COP(reg)->memory_end.num = node->m.regnum; #endif @@ -1454,7 +1574,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + len + SIZE_OP_POP_OUT + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP_OUT + OPSIZE_JUMP; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); if (r != 0) return r; @@ -1463,7 +1583,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP_OUT); + COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP_OUT); } else { r = add_op(reg, OP_ATOMIC_START); @@ -1493,11 +1613,11 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) else then_len = 0; - jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END + SIZE_OP_JUMP; + jump_len = cond_len + then_len + OPSIZE_ATOMIC_END + OPSIZE_JUMP; r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + jump_len; + COP(reg)->push.addr = SIZE_INC + jump_len; r = compile_tree(cond, reg, env); if (r != 0) return r; @@ -1518,7 +1638,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = SIZE_OP_ATOMIC_END + else_len + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_ATOMIC_END + else_len + SIZE_INC; r = add_op(reg, OP_ATOMIC_END); if (r != 0) return r; @@ -1546,16 
+1666,16 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) switch (node->type) { case ANCR_PREC_READ: - len = SIZE_OP_PREC_READ_START + tlen + SIZE_OP_PREC_READ_END; + len = OPSIZE_PREC_READ_START + tlen + OPSIZE_PREC_READ_END; break; case ANCR_PREC_READ_NOT: - len = SIZE_OP_PREC_READ_NOT_START + tlen + SIZE_OP_PREC_READ_NOT_END; + len = OPSIZE_PREC_READ_NOT_START + tlen + OPSIZE_PREC_READ_NOT_END; break; case ANCR_LOOK_BEHIND: - len = SIZE_OP_LOOK_BEHIND + tlen; + len = OPSIZE_LOOK_BEHIND + tlen; break; case ANCR_LOOK_BEHIND_NOT: - len = SIZE_OP_LOOK_BEHIND_NOT_START + tlen + SIZE_OP_LOOK_BEHIND_NOT_END; + len = OPSIZE_LOOK_BEHIND_NOT_START + tlen + OPSIZE_LOOK_BEHIND_NOT_END; break; case ANCR_WORD_BOUNDARY: @@ -1564,7 +1684,7 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) case ANCR_WORD_BEGIN: case ANCR_WORD_END: #endif - len = SIZE_OP_WORD_BOUNDARY; + len = OPSIZE_WORD_BOUNDARY; break; case ANCR_TEXT_SEGMENT_BOUNDARY: @@ -1648,7 +1768,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_PREC_READ_NOT_START); if (r != 0) return r; - COP(reg)->prec_read_not_start.addr = SIZE_INC_OP + len + SIZE_OP_PREC_READ_NOT_END; + COP(reg)->prec_read_not_start.addr = SIZE_INC + len + OPSIZE_PREC_READ_NOT_END; r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); if (r != 0) return r; r = add_op(reg, OP_PREC_READ_NOT_END); @@ -1678,7 +1798,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); r = add_op(reg, OP_LOOK_BEHIND_NOT_START); if (r != 0) return r; - COP(reg)->look_behind_not_start.addr = SIZE_INC_OP + len + SIZE_OP_LOOK_BEHIND_NOT_END; + COP(reg)->look_behind_not_start.addr = SIZE_INC + len + OPSIZE_LOOK_BEHIND_NOT_END; if (node->char_len < 0) { r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n); @@ -1764,25 +1884,25 @@ compile_length_gimmick_node(GimmickNode* node, regex_t* reg) switch (node->type) { case GIMMICK_FAIL: - len = 
SIZE_OP_FAIL; + len = OPSIZE_FAIL; break; case GIMMICK_SAVE: - len = SIZE_OP_PUSH_SAVE_VAL; + len = OPSIZE_PUSH_SAVE_VAL; break; case GIMMICK_UPDATE_VAR: - len = SIZE_OP_UPDATE_VAR; + len = OPSIZE_UPDATE_VAR; break; #ifdef USE_CALLOUT case GIMMICK_CALLOUT: switch (node->detail_type) { case ONIG_CALLOUT_OF_CONTENTS: - len = SIZE_OP_CALLOUT_CONTENTS; + len = OPSIZE_CALLOUT_CONTENTS; break; case ONIG_CALLOUT_OF_NAME: - len = SIZE_OP_CALLOUT_NAME; + len = OPSIZE_CALLOUT_NAME; break; default: @@ -1821,13 +1941,13 @@ compile_length_tree(Node* node, regex_t* reg) r += compile_length_tree(NODE_CAR(node), reg); n++; } while (IS_NOT_NULL(node = NODE_CDR(node))); - r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1); + r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1); } break; case NODE_STRING: - if (NODE_STRING_IS_RAW(node)) - r = compile_length_string_raw_node(STR_(node), reg); + if (NODE_STRING_IS_CRUDE(node)) + r = compile_length_string_crude_node(STR_(node), reg); else r = compile_length_string_node(node, reg); break; @@ -1841,12 +1961,12 @@ compile_length_tree(Node* node, regex_t* reg) break; case NODE_BACKREF: - r = SIZE_OP_BACKREF; + r = OPSIZE_BACKREF; break; #ifdef USE_CALL case NODE_CALL: - r = SIZE_OP_CALL; + r = OPSIZE_CALL; break; #endif @@ -1893,7 +2013,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) do { len += compile_length_tree(NODE_CAR(x), reg); if (IS_NOT_NULL(NODE_CDR(x))) { - len += SIZE_OP_PUSH + SIZE_OP_JUMP; + len += OPSIZE_PUSH + OPSIZE_JUMP; } } while (IS_NOT_NULL(x = NODE_CDR(x))); pos = COP_CURR_OFFSET(reg) + 1 + len; /* goal position */ @@ -1904,7 +2024,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) enum OpCode push = NODE_IS_SUPER(node) ? 
OP_PUSH_SUPER : OP_PUSH; r = add_op(reg, push); if (r != 0) break; - COP(reg)->push.addr = SIZE_INC_OP + len + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + len + OPSIZE_JUMP; } r = compile_tree(NODE_CAR(node), reg, env); if (r != 0) break; @@ -1919,8 +2039,8 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) break; case NODE_STRING: - if (NODE_STRING_IS_RAW(node)) - r = compile_string_raw_node(STR_(node), reg); + if (NODE_STRING_IS_CRUDE(node)) + r = compile_string_crude_node(STR_(node), reg); else r = compile_string_node(node, reg); break; @@ -2090,8 +2210,9 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) Node** ptarget = &(NODE_BODY(node)); Node* old = *ptarget; r = noname_disable_map(ptarget, map, counter); + if (r != 0) return r; if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) { - onig_reduce_nested_quantifier(node, *ptarget); + r = onig_reduce_nested_quantifier(node); } } break; @@ -2303,11 +2424,11 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) } } - loc = env->capture_history; - MEM_STATUS_CLEAR(env->capture_history); + loc = env->cap_history; + MEM_STATUS_CLEAR(env->cap_history); for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { if (MEM_STATUS_AT(loc, i)) { - MEM_STATUS_ON_SIMPLE(env->capture_history, map[i].new_val); + MEM_STATUS_ON_SIMPLE(env->cap_history, map[i].new_val); } } @@ -2683,7 +2804,7 @@ is_exclusive(Node* x, Node* y, regex_t* reg) len = NODE_STRING_LEN(x); if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y); - if (NODE_STRING_IS_AMBIG(x) || NODE_STRING_IS_AMBIG(y)) { + if (NODE_STRING_IS_CASE_FOLD_MATCH(x) || NODE_STRING_IS_CASE_FOLD_MATCH(y)) { /* tiny version */ return 0; } @@ -2743,7 +2864,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) break; if (exact == 0 || - ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_RAW(node)) { + ! 
IS_IGNORECASE(reg->options) || NODE_STRING_IS_CRUDE(node)) { n = node; } } @@ -2871,9 +2992,9 @@ tree_min_len(Node* node, ScanEnv* env) if (NODE_IS_RECURSION(node)) break; backs = BACKREFS_P(br); - len = tree_min_len(mem_env[backs[0]].node, env); + len = tree_min_len(mem_env[backs[0]].mem_node, env); for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].node, env); + tmin = tree_min_len(mem_env[backs[i]].mem_node, env); if (len > tmin) len = tmin; } } @@ -3042,7 +3163,7 @@ tree_max_len(Node* node, ScanEnv* env) } backs = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { - tmax = tree_max_len(mem_env[backs[i]].node, env); + tmax = tree_max_len(mem_env[backs[i]].mem_node, env); if (len < tmax) len = tmax; } } @@ -3179,7 +3300,7 @@ check_backrefs(Node* node, ScanEnv* env) if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; - NODE_STATUS_ADD(mem_env[backs[i]].node, BACKREF); + NODE_STATUS_ADD(mem_env[backs[i]].mem_node, BACKREF); } r = 0; } @@ -3193,6 +3314,204 @@ check_backrefs(Node* node, ScanEnv* env) return r; } +static int +set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) +{ + int r; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + r = set_empty_repeat_node_trav(NODE_CAR(node), empty, env); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + { + AnchorNode* an = ANCHOR_(node); + + if (! 
ANCHOR_HAS_BODY(an)) { + r = 0; + break; + } + + switch (an->type) { + case ANCR_PREC_READ: + case ANCR_LOOK_BEHIND: + empty = NULL_NODE; + break; + default: + break; + } + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + } + break; + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + + if (qn->emptiness != BODY_IS_NOT_EMPTY) empty = node; + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + } + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) { + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + if (r != 0) return r; + } + { + BagNode* en = BAG_(node); + + if (en->type == BAG_MEMORY) { + if (NODE_IS_BACKREF(node)) { + if (IS_NOT_NULL(empty)) + SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty; + } + } + else if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = set_empty_repeat_node_trav(en->te.Then, empty, env); + if (r != 0) return r; + } + if (IS_NOT_NULL(en->te.Else)) { + r = set_empty_repeat_node_trav(en->te.Else, empty, env); + } + } + } + break; + + default: + r = 0; + break; + } + + return r; +} + +static int +is_ancestor_node(Node* node, Node* me) +{ + Node* parent; + + while ((parent = NODE_PARENT(me)) != NULL_NODE) { + if (parent == node) return 1; + me = parent; + } + return 0; +} + +static void +set_empty_status_check_trav(Node* node, ScanEnv* env) +{ + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + set_empty_status_check_trav(NODE_CAR(node), env); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + { + AnchorNode* an = ANCHOR_(node); + + if (! 
ANCHOR_HAS_BODY(an)) break; + set_empty_status_check_trav(NODE_BODY(node), env); + } + break; + + case NODE_QUANT: + set_empty_status_check_trav(NODE_BODY(node), env); + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) + set_empty_status_check_trav(NODE_BODY(node), env); + { + BagNode* en = BAG_(node); + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + set_empty_status_check_trav(en->te.Then, env); + } + if (IS_NOT_NULL(en->te.Else)) { + set_empty_status_check_trav(en->te.Else, env); + } + } + } + break; + + case NODE_BACKREF: + { + int i; + int* backs; + MemEnv* mem_env = SCANENV_MEMENV(env); + BackRefNode* br = BACKREF_(node); + backs = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + Node* ernode = mem_env[backs[i]].empty_repeat_node; + if (IS_NOT_NULL(ernode)) { + if (! is_ancestor_node(ernode, node)) { + MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]); + NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK); + NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK); + } + } + } + } + break; + + default: + break; + } +} + +static void +set_parent_node_trav(Node* node, Node* parent) +{ + NODE_PARENT(node) = parent; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + set_parent_node_trav(NODE_CAR(node), node); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + if (! 
ANCHOR_HAS_BODY(ANCHOR_(node))) break; + set_parent_node_trav(NODE_BODY(node), node); + break; + + case NODE_QUANT: + set_parent_node_trav(NODE_BODY(node), node); + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) + set_parent_node_trav(NODE_BODY(node), node); + { + BagNode* en = BAG_(node); + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) + set_parent_node_trav(en->te.Then, node); + if (IS_NOT_NULL(en->te.Else)) { + set_parent_node_trav(en->te.Else, node); + } + } + } + break; + + default: + break; + } +} + #ifdef USE_CALL @@ -3298,6 +3617,9 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) if ((eret & RECURSION_MUST) == 0) r &= ~RECURSION_MUST; } + else { + r &= ~RECURSION_MUST; + } } else { r = infinite_recursive_call_check(NODE_BODY(node), env, head); @@ -3472,7 +3794,7 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) r = recursive_call_check_trav(NODE_BODY(node), env, state); if (QUANT_(node)->upper == 0) { if (r == FOUND_CALLED_NODE) - QUANT_(node)->is_refered = 1; + QUANT_(node)->include_referred = 1; } break; @@ -3495,34 +3817,127 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) if (! 
NODE_IS_RECURSION(node)) { NODE_STATUS_ADD(node, MARK1); r = recursive_call_check(NODE_BODY(node)); - if (r != 0) + if (r != 0) { NODE_STATUS_ADD(node, RECURSION); + MEM_STATUS_ON(env->backtrack_mem, en->m.regnum); + } NODE_STATUS_REMOVE(node, MARK1); } - if (NODE_IS_CALLED(node)) - r = FOUND_CALLED_NODE; - } - } + if (NODE_IS_CALLED(node)) + r = FOUND_CALLED_NODE; + } + } + + state1 = state; + if (NODE_IS_RECURSION(node)) + state1 |= IN_RECURSION; + + ret = recursive_call_check_trav(NODE_BODY(node), env, state1); + if (ret == FOUND_CALLED_NODE) + r = FOUND_CALLED_NODE; + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + ret = recursive_call_check_trav(en->te.Then, env, state1); + if (ret == FOUND_CALLED_NODE) + r = FOUND_CALLED_NODE; + } + if (IS_NOT_NULL(en->te.Else)) { + ret = recursive_call_check_trav(en->te.Else, env, state1); + if (ret == FOUND_CALLED_NODE) + r = FOUND_CALLED_NODE; + } + } + } + break; + + default: + break; + } + + return r; +} + +#endif + +static void +remove_from_list(Node* prev, Node* a) +{ + if (NODE_CDR(prev) != a) return ; + + NODE_CDR(prev) = NODE_CDR(a); + NODE_CDR(a) = NULL_NODE; +} + +static int +reduce_string_list(Node* node) +{ + int r = 0; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + { + Node* prev; + Node* curr; + Node* prev_node; + Node* next_node; + + prev = NULL_NODE; + do { + next_node = NODE_CDR(node); + curr = NODE_CAR(node); + if (NODE_TYPE(curr) == NODE_STRING) { + if (IS_NULL(prev) || STR_(curr)->flag != STR_(prev)->flag) { + prev = curr; + prev_node = node; + } + else { + r = node_str_node_cat(prev, curr); + if (r != 0) return r; + remove_from_list(prev_node, node); + onig_node_free(node); + } + } + else { + prev = NULL_NODE; + prev_node = node; + } + + node = next_node; + } while (r == 0 && IS_NOT_NULL(node)); + } + break; + + case NODE_ALT: + do { + r = reduce_string_list(NODE_CAR(node)); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + if 
(IS_NULL(NODE_BODY(node))) + break; + /* fall */ + case NODE_QUANT: + r = reduce_string_list(NODE_BODY(node)); + break; - state1 = state; - if (NODE_IS_RECURSION(node)) - state1 |= IN_RECURSION; + case NODE_BAG: + { + BagNode* en = BAG_(node); - ret = recursive_call_check_trav(NODE_BODY(node), env, state1); - if (ret == FOUND_CALLED_NODE) - r = FOUND_CALLED_NODE; + r = reduce_string_list(NODE_BODY(node)); + if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - ret = recursive_call_check_trav(en->te.Then, env, state1); - if (ret == FOUND_CALLED_NODE) - r = FOUND_CALLED_NODE; + r = reduce_string_list(en->te.Then); + if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) { - ret = recursive_call_check_trav(en->te.Else, env, state1); - if (ret == FOUND_CALLED_NODE) - r = FOUND_CALLED_NODE; + r = reduce_string_list(en->te.Else); + if (r != 0) return r; } } } @@ -3535,7 +3950,6 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) return r; } -#endif #define IN_ALT (1<<0) #define IN_NOT (1<<1) @@ -3559,7 +3973,7 @@ divide_look_behind_alternatives(Node* node) head = NODE_ANCHOR_BODY(an); np = NODE_CAR(head); - swap_node(node, head); + node_swap(node, head); NODE_CAR(node) = head; NODE_BODY(head) = np; @@ -3581,7 +3995,7 @@ divide_look_behind_alternatives(Node* node) } static int -setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) +tune_look_behind(Node* node, regex_t* reg, ScanEnv* env) { int r, len; AnchorNode* an = ANCHOR_(node); @@ -3602,7 +4016,7 @@ setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) } static int -next_setup(Node* node, Node* next_node, regex_t* reg) +tune_next(Node* node, Node* next_node, regex_t* reg) { NodeType type; @@ -3629,7 +4043,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK); CHECK_NULL_RETURN_MEMERR(en); NODE_STATUS_ADD(en, STRICT_REAL_REPEAT); - swap_node(node, en); + node_swap(node, en); NODE_BODY(node) = en; } } @@ -3649,23 
+4063,57 @@ next_setup(Node* node, Node* next_node, regex_t* reg) static int -update_string_node_case_fold(regex_t* reg, Node *node) +is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[]) { - UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + int i; + + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->code_len != 1) return 0; + } + + return 1; +} + +static int +get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], int* rmin, int* rmax) +{ + int i, len, minlen, maxlen; + + minlen = INT_MAX; + maxlen = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + + len = item->byte_len; + if (len < minlen) minlen = len; + if (len > maxlen) maxlen = len; + } + + *rmin = minlen; + *rmax = maxlen; + return 0; +} + +static int +conv_string_case_fold(OnigEncoding enc, OnigCaseFoldType case_fold_flag, + UChar* s, UChar* end, UChar** rs, UChar** rend, int* rcase_min_len) +{ + UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar *sbuf, *ebuf, *sp; - int r, i, len, sbuf_size; - StrNode* sn = STR_(node); + int i, n, len, sbuf_size; - end = sn->end; - sbuf_size = (int )(end - sn->s) * 2; + *rs = NULL; + sbuf_size = (int )(end - s) * 2; sbuf = (UChar* )xmalloc(sbuf_size); CHECK_NULL_RETURN_MEMERR(sbuf); ebuf = sbuf + sbuf_size; + n = 0; sp = sbuf; - p = sn->s; + p = s; while (p < end) { - len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf); + len = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, buf); for (i = 0; i < len; i++) { if (sp >= ebuf) { sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2); @@ -3677,356 +4125,302 @@ update_string_node_case_fold(regex_t* reg, Node *node) *sp++ = buf[i]; } + n++; } - r = onig_node_str_set(node, sbuf, sp); - if (r != 0) { - xfree(sbuf); - return r; - } - - xfree(sbuf); + *rs = sbuf; + *rend = sp; + *rcase_min_len = n; return 0; } static int -expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end, regex_t* reg) 
+make_code_list_to_string(Node** rnode, OnigEncoding enc, + int n, OnigCodePoint codes[]) { - int r; - Node *node; + int r, i, len; + Node* node; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - node = onig_node_new_str(s, end); - if (IS_NULL(node)) return ONIGERR_MEMORY; + *rnode = NULL_NODE; + node = onig_node_new_str(NULL, NULL); + CHECK_NULL_RETURN_MEMERR(node); - r = update_string_node_case_fold(reg, node); - if (r != 0) { - onig_node_free(node); - return r; + for (i = 0; i < n; i++) { + len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf); + if (len < 0) { + r = len; + goto err; + } + + r = onig_node_str_cat(node, buf, buf + len); + if (r != 0) goto err; } - NODE_STRING_SET_AMBIG(node); - NODE_STRING_SET_DONT_GET_OPT_INFO(node); *rnode = node; return 0; + + err: + onig_node_free(node); + return r; } static int -expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], UChar *p, - int slen, UChar *end, regex_t* reg, Node **rnode) +unravel_cf_node_add(Node** rlist, Node* add) { - int r, i, j; - int len; - int varlen; - Node *anode, *var_anode, *snode, *xnode, *an; - UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - - *rnode = var_anode = NULL_NODE; + Node *list; - varlen = 0; - for (i = 0; i < item_num; i++) { - if (items[i].byte_len != slen) { - varlen = 1; - break; - } + list = *rlist; + if (IS_NULL(list)) { + list = onig_node_new_list(add, NULL); + CHECK_NULL_RETURN_MEMERR(list); + *rlist = list; } + else { + Node* r = node_list_add(list, add); + CHECK_NULL_RETURN_MEMERR(r); + } + + return 0; +} - if (varlen != 0) { - *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(var_anode)) return ONIGERR_MEMORY; +static int +unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, + unsigned int flag, int case_min_len) +{ + int r; + Node *sn, *list; - xnode = onig_node_new_list(NULL, NULL); - if (IS_NULL(xnode)) goto mem_err; - NODE_CAR(var_anode) = xnode; + list = *rlist; + sn = *rsn; - anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - 
if (IS_NULL(anode)) goto mem_err; - NODE_CAR(xnode) = anode; + if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) { + if (NODE_STRING_IS_CASE_FOLD_MATCH(sn)) + r = node_str_cat_case_fold(sn, s, end, case_min_len); + else + r = onig_node_str_cat(sn, s, end); } else { - *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(anode)) return ONIGERR_MEMORY; + sn = onig_node_new_str(s, end); + CHECK_NULL_RETURN_MEMERR(sn); + + STR_(sn)->flag = flag; + STR_(sn)->case_min_len = case_min_len; + r = unravel_cf_node_add(&list, sn); } - snode = onig_node_new_str(p, p + slen); - if (IS_NULL(snode)) goto mem_err; + if (r == 0) { + *rlist = list; + *rsn = sn; + } + return r; +} - NODE_CAR(anode) = snode; +static int +unravel_cf_string_fold_add(Node** rlist, Node** rsn, OnigEncoding enc, + OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) +{ + int r; + int case_min_len; + UChar *rs, *rend; - for (i = 0; i < item_num; i++) { - snode = onig_node_new_str(NULL, NULL); - if (IS_NULL(snode)) goto mem_err; + r = conv_string_case_fold(enc, case_fold_flag, s, end, + &rs, &rend, &case_min_len); + if (r != 0) return r; - for (j = 0; j < items[i].code_len; j++) { - len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf); - if (len < 0) { - r = len; - goto mem_err2; - } + r = unravel_cf_string_add(rlist, rsn, rs, rend, + NODE_STRING_CASE_FOLD_MATCH, case_min_len); + xfree(rs); - r = onig_node_str_cat(snode, buf, buf + len); - if (r != 0) goto mem_err2; - } + return r; +} - an = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(an)) { - goto mem_err2; - } +static int +unravel_cf_string_alt_or_cc_add(Node** rlist, int n, + OnigCaseFoldCodeItem items[], int byte_len, OnigEncoding enc, + OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) +{ + int r, i; + Node* node; - if (items[i].byte_len != slen && IS_NOT_NULL(var_anode)) { - Node *rem; - UChar *q = p + items[i].byte_len; + if (is_all_code_len_1_items(n, items)) { + OnigCodePoint codes[14];/* least 
ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */ - if (q < end) { - r = expand_case_fold_make_rem_string(&rem, q, end, reg); - if (r != 0) { - onig_node_free(an); - goto mem_err2; - } + codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end); + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + codes[i+1] = item->code[0]; + } + r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes); + if (r != 0) return r; + } + else { + Node *snode, *alt, *curr; - xnode = onig_node_list_add(NULL_NODE, snode); - if (IS_NULL(xnode)) { - onig_node_free(an); - onig_node_free(rem); - goto mem_err2; - } - if (IS_NULL(onig_node_list_add(xnode, rem))) { - onig_node_free(an); - onig_node_free(xnode); - onig_node_free(rem); - goto mem_err; - } + snode = onig_node_new_str(s, end); + CHECK_NULL_RETURN_MEMERR(snode); + node = curr = onig_node_new_alt(snode, NULL_NODE); + if (IS_NULL(curr)) { + onig_node_free(snode); + return ONIGERR_MEMORY; + } - NODE_CAR(an) = xnode; + r = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + r = make_code_list_to_string(&snode, enc, item->code_len, item->code); + if (r != 0) { + onig_node_free(node); + return r; } - else { - NODE_CAR(an) = snode; + + alt = onig_node_new_alt(snode, NULL_NODE); + if (IS_NULL(alt)) { + onig_node_free(snode); + onig_node_free(node); + return ONIGERR_MEMORY; } - NODE_CDR(var_anode) = an; - var_anode = an; - } - else { - NODE_CAR(an) = snode; - NODE_CDR(anode) = an; - anode = an; + NODE_CDR(curr) = alt; + curr = alt; } } - return varlen; - - mem_err2: - onig_node_free(snode); - - mem_err: - onig_node_free(*rnode); - - return ONIGERR_MEMORY; + r = unravel_cf_node_add(rlist, node); + if (r != 0) onig_node_free(node); + return r; } static int -is_good_case_fold_items_for_search(OnigEncoding enc, int slen, - int n, OnigCaseFoldCodeItem items[]) +unravel_cf_look_behind_add(Node** rlist, Node** rsn, + int n, OnigCaseFoldCodeItem items[], OnigEncoding enc, + UChar* s, int one_len) { - int i, len; - UChar 
buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + int r, i, found; + found = 0; for (i = 0; i < n; i++) { OnigCaseFoldCodeItem* item = items + i; + if (item->byte_len == one_len) { + if (item->code_len == 1) { + found = 1; + } + } + } - if (item->code_len != 1) return 0; - if (item->byte_len != slen) return 0; - len = ONIGENC_CODE_TO_MBC(enc, item->code[0], buf); - if (len != slen) return 0; + if (found == 0) { + r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */, 0); } + else { + Node* node; + OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */ - return 1; -} + found = 0; + codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len); + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->byte_len == one_len) { + if (item->code_len == 1) { + codes[found++] = item->code[0]; + } + } + } + r = onig_new_cclass_with_code_list(&node, enc, found, codes); + if (r != 0) return r; + + r = unravel_cf_node_add(rlist, node); + if (r != 0) onig_node_free(node); + + *rsn = NULL_NODE; + } -#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8 + return r; +} static int -expand_case_fold_string(Node* node, regex_t* reg, int state) -{ - int r, n, len, alt_num; - int fold_len; - int prev_is_ambig, prev_is_good, is_good, is_in_look_behind; - UChar *start, *end, *p; - UChar* foldp; - Node *top_root, *root, *snode, *prev_node; +unravel_case_fold_string(Node* node, regex_t* reg, int state) +{ + int r, n, one_len, min_len, max_len, in_look_behind; + UChar *start, *end, *p, *q; + StrNode* snode; + Node *sn, *list; + OnigEncoding enc; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - StrNode* sn; - if (NODE_STRING_IS_AMBIG(node)) return 0; + if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0; - sn = STR_(node); + snode = STR_(node); - start = sn->s; - end = sn->end; + start = snode->s; + end = snode->end; if (start >= end) return 0; - is_in_look_behind = (state & IN_LOOK_BEHIND) != 0; + 
in_look_behind = (state & IN_LOOK_BEHIND) != 0; + enc = reg->enc; - r = 0; - top_root = root = prev_node = snode = NULL_NODE; - alt_num = 1; + list = sn = NULL_NODE; p = start; while (p < end) { - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag, - p, end, items); + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, + items); if (n < 0) { r = n; goto err; } - len = enclen(reg->enc, p); - is_good = is_good_case_fold_items_for_search(reg->enc, len, n, items); - - if (is_in_look_behind || - (IS_NOT_NULL(snode) || - (is_good - /* expand single char case: ex. /(?i:a)/ */ - && !(p == start && p + len >= end)))) { - if (IS_NULL(snode)) { - if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - prev_node = snode = onig_node_new_str(NULL, NULL); - if (IS_NULL(snode)) goto mem_err; - if (IS_NOT_NULL(root)) { - if (IS_NULL(onig_node_list_add(root, snode))) { - onig_node_free(snode); - goto mem_err; - } - } - - prev_is_ambig = -1; /* -1: new */ - prev_is_good = 0; /* escape compiler warning */ - } - else { - prev_is_ambig = NODE_STRING_IS_AMBIG(snode); - prev_is_good = NODE_STRING_IS_GOOD_AMBIG(snode); - } - - if (n != 0) { - foldp = p; - fold_len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, - &foldp, end, buf); - foldp = buf; - } - else { - foldp = p; fold_len = len; - } - - if ((prev_is_ambig == 0 && n != 0) || - (prev_is_ambig > 0 && (n == 0 || prev_is_good != is_good))) { - if (IS_NULL(root) /* && IS_NOT_NULL(prev_node) */) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - prev_node = snode = onig_node_new_str(foldp, foldp + fold_len); - if (IS_NULL(snode)) goto mem_err; - if (IS_NULL(onig_node_list_add(root, snode))) { - onig_node_free(snode); - goto mem_err; - } - } - else { - r = 
onig_node_str_cat(snode, foldp, foldp + fold_len); - if (r != 0) goto err; - } - - if (n != 0) NODE_STRING_SET_AMBIG(snode); - if (is_good != 0) NODE_STRING_SET_GOOD_AMBIG(snode); + one_len = enclen(enc, p); + if (n == 0) { + q = p + one_len; + r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */, 0); + if (r != 0) goto err; } else { - alt_num *= (n + 1); - if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break; - - if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } + if (in_look_behind != 0) { + q = p + one_len; + r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len); + if (r != 0) goto err; } - - r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node); - if (r < 0) goto mem_err; - if (r == 1) { - if (IS_NULL(root)) { - top_root = prev_node; + else { + get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len); + q = p + max_len; + if (one_len == max_len && min_len == max_len) { + r = unravel_cf_string_alt_or_cc_add(&list, n, items, max_len, enc, + reg->case_fold_flag, p, q); + if (r != 0) goto err; + sn = NULL_NODE; } else { - if (IS_NULL(onig_node_list_add(root, prev_node))) { - onig_node_free(prev_node); - goto mem_err; - } - } - - root = NODE_CAR(prev_node); - } - else { /* r == 0 */ - if (IS_NOT_NULL(root)) { - if (IS_NULL(onig_node_list_add(root, prev_node))) { - onig_node_free(prev_node); - goto mem_err; - } + r = unravel_cf_string_fold_add(&list, &sn, enc, reg->case_fold_flag, + p, q); + if (r != 0) goto err; } } - - snode = NULL_NODE; } - p += len; + p = q; } - if (p < end) { - Node *srem; - - r = expand_case_fold_make_rem_string(&srem, p, end, reg); - if (r != 0) goto mem_err; - - if (IS_NOT_NULL(prev_node) && IS_NULL(root)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(srem); - onig_node_free(prev_node); - goto mem_err; - 
} - } - - if (IS_NULL(root)) { - prev_node = srem; + if (IS_NOT_NULL(list)) { + if (node_list_len(list) == 1) { + node_swap(node, NODE_CAR(list)); } else { - if (IS_NULL(onig_node_list_add(root, srem))) { - onig_node_free(srem); - goto mem_err; - } + node_swap(node, list); } + onig_node_free(list); + } + else { + node_swap(node, sn); + onig_node_free(sn); } - - /* ending */ - top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node); - swap_node(node, top_root); - onig_node_free(top_root); return 0; - mem_err: - r = ONIGERR_MEMORY; - err: - onig_node_free(top_root); + if (IS_NOT_NULL(list)) + onig_node_free(list); + else if (IS_NOT_NULL(sn)) + onig_node_free(sn); + return r; } @@ -4121,7 +4515,7 @@ quantifiers_memory_node_info(Node* node) __inline #endif static int -setup_call_node_call(CallNode* cn, ScanEnv* env, int state) +tune_call_node_call(CallNode* cn, ScanEnv* env, int state) { MemEnv* mem_env = SCANENV_MEMENV(env); @@ -4141,7 +4535,7 @@ setup_call_node_call(CallNode* cn, ScanEnv* env, int state) } set_call_attr: - NODE_CALL_BODY(cn) = mem_env[cn->group_num].node; + NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node; if (IS_NULL(NODE_CALL_BODY(cn))) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); @@ -4172,23 +4566,23 @@ setup_call_node_call(CallNode* cn, ScanEnv* env, int state) } static void -setup_call2_call(Node* node) +tune_call2_call(Node* node) { switch (NODE_TYPE(node)) { case NODE_LIST: case NODE_ALT: do { - setup_call2_call(NODE_CAR(node)); + tune_call2_call(NODE_CAR(node)); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); break; case NODE_BAG: @@ -4198,19 +4592,19 @@ setup_call2_call(Node* node) if (en->type == BAG_MEMORY) { if (! 
NODE_IS_MARK1(node)) { NODE_STATUS_ADD(node, MARK1); - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); NODE_STATUS_REMOVE(node, MARK1); } } else if (en->type == BAG_IF_ELSE) { - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); if (IS_NOT_NULL(en->te.Then)) - setup_call2_call(en->te.Then); + tune_call2_call(en->te.Then); if (IS_NOT_NULL(en->te.Else)) - setup_call2_call(en->te.Else); + tune_call2_call(en->te.Else); } else { - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); } } break; @@ -4226,7 +4620,7 @@ setup_call2_call(Node* node) NODE_STATUS_ADD(called, CALLED); BAG_(called)->m.entry_count++; - setup_call2_call(called); + tune_call2_call(called); } NODE_STATUS_REMOVE(node, MARK1); } @@ -4238,7 +4632,7 @@ setup_call2_call(Node* node) } static int -setup_call(Node* node, ScanEnv* env, int state) +tune_call(Node* node, ScanEnv* env, int state) { int r; @@ -4246,7 +4640,7 @@ setup_call(Node* node, ScanEnv* env, int state) case NODE_LIST: case NODE_ALT: do { - r = setup_call(NODE_CAR(node), env, state); + r = tune_call(NODE_CAR(node), env, state); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -4254,12 +4648,12 @@ setup_call(Node* node, ScanEnv* env, int state) if (QUANT_(node)->upper == 0) state |= IN_ZERO_REPEAT; - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); else r = 0; break; @@ -4273,20 +4667,20 @@ setup_call(Node* node, ScanEnv* env, int state) NODE_STATUS_ADD(node, IN_ZERO_REPEAT); BAG_(node)->m.entry_count--; } - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); } else if (en->type == BAG_IF_ELSE) { - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); if (r != 0) return r; if 
(IS_NOT_NULL(en->te.Then)) { - r = setup_call(en->te.Then, env, state); + r = tune_call(en->te.Then, env, state); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_call(en->te.Else, env, state); + r = tune_call(en->te.Else, env, state); } else - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); } break; @@ -4296,7 +4690,7 @@ setup_call(Node* node, ScanEnv* env, int state) CALL_(node)->entry_count--; } - r = setup_call_node_call(CALL_(node), env, state); + r = tune_call_node_call(CALL_(node), env, state); break; default: @@ -4308,7 +4702,7 @@ setup_call(Node* node, ScanEnv* env, int state) } static int -setup_call2(Node* node) +tune_call2(Node* node) { int r = 0; @@ -4316,23 +4710,23 @@ setup_call2(Node* node) case NODE_LIST: case NODE_ALT: do { - r = setup_call2(NODE_CAR(node)); + r = tune_call2(NODE_CAR(node)); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: if (QUANT_(node)->upper != 0) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); break; case NODE_BAG: if (! NODE_IS_IN_ZERO_REPEAT(node)) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); { BagNode* en = BAG_(node); @@ -4340,18 +4734,18 @@ setup_call2(Node* node) if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = setup_call2(en->te.Then); + r = tune_call2(en->te.Then); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_call2(en->te.Else); + r = tune_call2(en->te.Else); } } break; case NODE_CALL: if (! 
NODE_IS_IN_ZERO_REPEAT(node)) { - setup_call2_call(node); + tune_call2_call(node); } break; @@ -4364,7 +4758,7 @@ setup_call2(Node* node) static void -setup_called_state_call(Node* node, int state) +tune_called_state_call(Node* node, int state) { switch (NODE_TYPE(node)) { case NODE_ALT: @@ -4372,7 +4766,7 @@ setup_called_state_call(Node* node, int state) /* fall */ case NODE_LIST: do { - setup_called_state_call(NODE_CAR(node), state); + tune_called_state_call(NODE_CAR(node), state); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -4385,7 +4779,7 @@ setup_called_state_call(Node* node, int state) if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - setup_called_state_call(NODE_QUANT_BODY(qn), state); + tune_called_state_call(NODE_QUANT_BODY(qn), state); } break; @@ -4400,7 +4794,7 @@ setup_called_state_call(Node* node, int state) /* fall */ case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: - setup_called_state_call(NODE_ANCHOR_BODY(an), state); + tune_called_state_call(NODE_ANCHOR_BODY(an), state); break; default: break; @@ -4416,31 +4810,33 @@ setup_called_state_call(Node* node, int state) if (NODE_IS_MARK1(node)) { if ((~en->m.called_state & state) != 0) { en->m.called_state |= state; - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); } } else { NODE_STATUS_ADD(node, MARK1); en->m.called_state |= state; - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); NODE_STATUS_REMOVE(node, MARK1); } } else if (en->type == BAG_IF_ELSE) { + state |= IN_ALT; + tune_called_state_call(NODE_BODY(node), state); if (IS_NOT_NULL(en->te.Then)) { - setup_called_state_call(en->te.Then, state); + tune_called_state_call(en->te.Then, state); } if (IS_NOT_NULL(en->te.Else)) - setup_called_state_call(en->te.Else, state); + tune_called_state_call(en->te.Else, state); } else { - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); } } break; case 
NODE_CALL: - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); break; default: @@ -4449,7 +4845,7 @@ setup_called_state_call(Node* node, int state) } static void -setup_called_state(Node* node, int state) +tune_called_state(Node* node, int state) { switch (NODE_TYPE(node)) { case NODE_ALT: @@ -4457,13 +4853,13 @@ setup_called_state(Node* node, int state) /* fall */ case NODE_LIST: do { - setup_called_state(NODE_CAR(node), state); + tune_called_state(NODE_CAR(node), state); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; #ifdef USE_CALL case NODE_CALL: - setup_called_state_call(node, state); + tune_called_state_call(node, state); break; #endif @@ -4480,14 +4876,15 @@ setup_called_state(Node* node, int state) /* fall */ case BAG_OPTION: case BAG_STOP_BACKTRACK: - setup_called_state(NODE_BODY(node), state); + tune_called_state(NODE_BODY(node), state); break; case BAG_IF_ELSE: - setup_called_state(NODE_BODY(node), state); + state |= IN_ALT; + tune_called_state(NODE_BODY(node), state); if (IS_NOT_NULL(en->te.Then)) - setup_called_state(en->te.Then, state); + tune_called_state(en->te.Then, state); if (IS_NOT_NULL(en->te.Else)) - setup_called_state(en->te.Else, state); + tune_called_state(en->te.Else, state); break; } } @@ -4502,7 +4899,7 @@ setup_called_state(Node* node, int state) if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - setup_called_state(NODE_QUANT_BODY(qn), state); + tune_called_state(NODE_QUANT_BODY(qn), state); } break; @@ -4517,7 +4914,7 @@ setup_called_state(Node* node, int state) /* fall */ case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: - setup_called_state(NODE_ANCHOR_BODY(an), state); + tune_called_state(NODE_ANCHOR_BODY(an), state); break; default: break; @@ -4538,13 +4935,13 @@ setup_called_state(Node* node, int state) #endif /* USE_CALL */ -static int setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env); +static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); #ifdef 
__GNUC__ __inline #endif static int -setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) { /* allowed node types in look-behind */ #define ALLOWED_TYPE_IN_LB \ @@ -4572,10 +4969,10 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) switch (an->type) { case ANCR_PREC_READ: - r = setup_tree(NODE_ANCHOR_BODY(an), reg, state, env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env); break; case ANCR_PREC_READ_NOT: - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); break; case ANCR_LOOK_BEHIND: @@ -4584,9 +4981,9 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_BAG_IN_LB, ALLOWED_ANCHOR_IN_LB); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env); if (r != 0) return r; - r = setup_look_behind(node, reg, env); + r = tune_look_behind(node, reg, env); } break; @@ -4596,10 +4993,10 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_BAG_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND), - env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND), + env); if (r != 0) return r; - r = setup_look_behind(node, reg, env); + r = tune_look_behind(node, reg, env); } break; @@ -4615,7 +5012,7 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) __inline #endif static int -setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) { int r; OnigLen d; @@ -4634,12 +5031,6 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (d == 0) { #ifdef 
USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT qn->emptiness = quantifiers_memory_node_info(body); - if (qn->emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) { - if (NODE_TYPE(body) == NODE_BAG && - BAG_(body)->type == BAG_MEMORY) { - MEM_STATUS_ON(env->bt_mem_end, BAG_(body)->m.regnum); - } - } #else qn->emptiness = BODY_IS_EMPTY_POSSIBILITY; #endif @@ -4651,7 +5042,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - r = setup_tree(body, reg, state, env); + r = tune_tree(body, reg, state, env); if (r != 0) return r; /* expand string */ @@ -4660,13 +5051,12 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper && qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { int len = NODE_STRING_LEN(body); - StrNode* sn = STR_(body); if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { int i, n = qn->lower; - onig_node_conv_to_str_node(node, STR_(body)->flag); + node_conv_to_str_node(node, STR_(body)->flag); for (i = 0; i < n; i++) { - r = onig_node_str_cat(node, sn->s, sn->end); + r = node_str_node_cat(node, body); if (r != 0) return r; } onig_node_free(body); @@ -4691,7 +5081,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) return r; } -/* setup_tree does the following work. +/* tune_tree does the following work. 1. check empty loop. (set qn->emptiness) 2. expand ignore-case in char class. 3. set memory status bit flags. (reg->mem_stats) @@ -4700,7 +5090,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) 6. expand repeated string. 
*/ static int -setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { int r = 0; @@ -4709,9 +5099,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { Node* prev = NULL_NODE; do { - r = setup_tree(NODE_CAR(node), reg, state, env); + r = tune_tree(NODE_CAR(node), reg, state, env); if (IS_NOT_NULL(prev) && r == 0) { - r = next_setup(prev, NODE_CAR(node), reg); + r = tune_next(prev, NODE_CAR(node), reg); } prev = NODE_CAR(node); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); @@ -4720,13 +5110,13 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) case NODE_ALT: do { - r = setup_tree(NODE_CAR(node), reg, (state | IN_ALT), env); + r = tune_tree(NODE_CAR(node), reg, (state | IN_ALT), env); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_STRING: - if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_RAW(node)) { - r = expand_case_fold_string(node, reg, state); + if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_CRUDE(node)) { + r = unravel_case_fold_string(node, reg, state); } break; @@ -4739,11 +5129,17 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) for (i = 0; i < br->back_num; i++) { if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; MEM_STATUS_ON(env->backrefed_mem, p[i]); - MEM_STATUS_ON(env->bt_mem_start, p[i]); +#if 0 #ifdef USE_BACKREF_WITH_LEVEL if (NODE_IS_NEST_LEVEL(node)) { - MEM_STATUS_ON(env->bt_mem_end, p[i]); + MEM_STATUS_ON(env->backtrack_mem, p[i]); } +#endif +#else + /* More precisely, it should be checked whether alt/repeat exists before + the subject capture node, and then this backreference position + exists before (or in) the capture node. 
*/ + MEM_STATUS_ON(env->backtrack_mem, p[i]); #endif } } @@ -4758,7 +5154,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { OnigOptionType options = reg->options; reg->options = BAG_(node)->o.options; - r = setup_tree(NODE_BODY(node), reg, state, env); + r = tune_tree(NODE_BODY(node), reg, state, env); reg->options = options; } break; @@ -4770,15 +5166,15 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0 || NODE_IS_RECURSION(node)) { - MEM_STATUS_ON(env->bt_mem_start, en->m.regnum); + MEM_STATUS_ON(env->backtrack_mem, en->m.regnum); } - r = setup_tree(NODE_BODY(node), reg, state, env); + r = tune_tree(NODE_BODY(node), reg, state, env); break; case BAG_STOP_BACKTRACK: { Node* target = NODE_BODY(node); - r = setup_tree(target, reg, state, env); + r = tune_tree(target, reg, state, env); if (NODE_TYPE(target) == NODE_QUANT) { QuantNode* tqn = QUANT_(target); if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 && @@ -4791,25 +5187,25 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case BAG_IF_ELSE: - r = setup_tree(NODE_BODY(node), reg, (state | IN_ALT), env); + r = tune_tree(NODE_BODY(node), reg, (state | IN_ALT), env); if (r != 0) return r; if (IS_NOT_NULL(en->te.Then)) { - r = setup_tree(en->te.Then, reg, (state | IN_ALT), env); + r = tune_tree(en->te.Then, reg, (state | IN_ALT), env); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_tree(en->te.Else, reg, (state | IN_ALT), env); + r = tune_tree(en->te.Else, reg, (state | IN_ALT), env); break; } } break; case NODE_QUANT: - r = setup_quant(node, reg, state, env); + r = tune_quant(node, reg, state, env); break; case NODE_ANCHOR: - r = setup_anchor(node, reg, state, env); + r = tune_anchor(node, reg, state, env); break; #ifdef USE_CALL @@ -4908,7 +5304,7 @@ typedef struct { } MinMax; typedef struct { - MinMax mmd; + MinMax mm; OnigEncoding enc; OnigOptionType options; 
OnigCaseFoldType case_fold_flag; @@ -4921,17 +5317,16 @@ typedef struct { } OptAnc; typedef struct { - MinMax mmd; /* position */ + MinMax mm; /* position */ OptAnc anc; int reach_end; int case_fold; - int good_case_fold; int len; UChar s[OPT_EXACT_MAXLEN]; } OptStr; typedef struct { - MinMax mmd; /* position */ + MinMax mm; /* position */ OptAnc anc; int value; /* weighted value */ UChar map[CHAR_MAP_SIZE]; @@ -5148,11 +5543,10 @@ is_full_opt_exact(OptStr* e) static void clear_opt_exact(OptStr* e) { - clear_mml(&e->mmd); + clear_mml(&e->mm); clear_opt_anc_info(&e->anc); e->reach_end = 0; e->case_fold = 0; - e->good_case_fold = 0; e->len = 0; e->s[0] = '\0'; } @@ -5176,11 +5570,6 @@ concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc) to->case_fold = 1; } - else { - if (to->good_case_fold != 0) { - if (add->good_case_fold == 0) return 0; - } - } } r = 0; @@ -5235,7 +5624,7 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) return ; } - if (! is_equal_mml(&to->mmd, &add->mmd)) { + if (! is_equal_mml(&to->mm, &add->mm)) { clear_opt_exact(to); return ; } @@ -5257,8 +5646,6 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) to->len = i; if (add->case_fold != 0) to->case_fold = 1; - if (add->good_case_fold == 0) - to->good_case_fold = 0; alt_merge_opt_anc_info(&to->anc, &add->anc); if (! 
to->reach_end) to->anc.right = 0; @@ -5291,10 +5678,7 @@ select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt) if (now->case_fold == 0) vn *= 2; if (alt->case_fold == 0) va *= 2; - if (now->good_case_fold != 0) vn *= 4; - if (alt->good_case_fold != 0) va *= 4; - - if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0) + if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0) copy_opt_exact(now, alt); } @@ -5378,7 +5762,7 @@ select_opt_map(OptMap* now, OptMap* alt) vn = z / now->value; va = z / alt->value; - if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0) + if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0) copy_opt_map(now, alt); } @@ -5392,17 +5776,14 @@ comp_opt_exact_or_map(OptStr* e, OptMap* m) if (m->value <= 0) return -1; if (e->case_fold != 0) { - if (e->good_case_fold != 0) - case_value = 2; - else - case_value = 1; + case_value = 1; } else case_value = 3; ae = COMP_EM_BASE * e->len * case_value; am = COMP_EM_BASE * 5 * 2 / m->value; - return comp_distance_value(&e->mmd, &m->mmd, ae, am); + return comp_distance_value(&e->mm, &m->mm, ae, am); } static void @@ -5410,14 +5791,14 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) { int i, val; - /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */ + /* if (! 
is_equal_mml(&to->mm, &add->mm)) return ; */ if (to->value == 0) return ; - if (add->value == 0 || to->mmd.max < add->mmd.min) { + if (add->value == 0 || to->mm.max < add->mm.min) { clear_opt_map(to); return ; } - alt_merge_mml(&to->mmd, &add->mmd); + alt_merge_mml(&to->mm, &add->mm); val = 0; for (i = 0; i < CHAR_MAP_SIZE; i++) { @@ -5435,9 +5816,9 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) static void set_bound_node_opt_info(OptNode* opt, MinMax* plen) { - copy_mml(&(opt->sb.mmd), plen); - copy_mml(&(opt->spr.mmd), plen); - copy_mml(&(opt->map.mmd), plen); + copy_mml(&(opt->sb.mm), plen); + copy_mml(&(opt->spr.mm), plen); + copy_mml(&(opt->map.mm), plen); } static void @@ -5472,7 +5853,7 @@ concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add) } if (add->map.value > 0 && to->len.max == 0) { - if (add->map.mmd.max == 0) + if (add->map.mm.max == 0) add->map.anc.left |= to->anc.left; } @@ -5497,10 +5878,7 @@ concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add) if (to->spr.len > 0) { if (add->len.max > 0) { - if (to->spr.len > (int )add->len.max) - to->spr.len = add->len.max; - - if (to->spr.mmd.max == 0) + if (to->spr.mm.max == 0) select_opt_exact(enc, &to->sb, &to->spr); else select_opt_exact(enc, &to->sm, &to->spr); @@ -5540,7 +5918,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) r = 0; enc = env->enc; clear_node_opt_info(opt); - set_bound_node_opt_info(opt, &env->mmd); + set_bound_node_opt_info(opt, &env->mm); switch (NODE_TYPE(node)) { case NODE_LIST: @@ -5552,7 +5930,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) do { r = optimize_nodes(NODE_CAR(nd), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mmd, &xo.len); + add_mml(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); } } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd))); @@ -5577,9 +5955,8 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) { StrNode* sn = STR_(node); int slen = (int )(sn->end - sn->s); - /* int is_raw 
= NODE_STRING_IS_RAW(node); */ - if (! NODE_STRING_IS_AMBIG(node)) { + if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) { concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); if (slen > 0) { add_char_opt_map(&opt->map, *(sn->s), enc); @@ -5587,28 +5964,20 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) set_mml(&opt->len, slen, slen); } else { - int max; + int max, min; - if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) { - int n = onigenc_strlen(enc, sn->s, sn->end); - max = ONIGENC_MBC_MAXLEN_DIST(enc) * n; - } - else { - concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); - opt->sb.case_fold = 1; - if (NODE_STRING_IS_GOOD_AMBIG(node)) - opt->sb.good_case_fold = 1; - - if (slen > 0) { - r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, - enc, env->case_fold_flag); - if (r != 0) break; - } + concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); + opt->sb.case_fold = 1; - max = slen; + if (slen > 0) { + r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, + enc, env->case_fold_flag); + if (r != 0) break; } - set_mml(&opt->len, slen, max); + max = slen; + min = sn->case_min_len * ONIGENC_MBC_MINLEN(enc); + set_mml(&opt->len, min, max); } } break; @@ -5618,7 +5987,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) int z; CClassNode* cc = CCLASS_(node); - /* no need to check ignore case. (set in setup_tree()) */ + /* no need to check ignore case. 
(set in tune_tree()) */ if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) { OnigLen min = ONIGENC_MBC_MINLEN(enc); @@ -5728,11 +6097,11 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) break; } backs = BACKREFS_P(br); - min = tree_min_len(mem_env[backs[0]].node, env->scan_env); - max = tree_max_len(mem_env[backs[0]].node, env->scan_env); + min = tree_min_len(mem_env[backs[0]].mem_node, env->scan_env); + max = tree_max_len(mem_env[backs[0]].mem_node, env->scan_env); for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].node, env->scan_env); - tmax = tree_max_len(mem_env[backs[i]].node, env->scan_env); + tmin = tree_min_len(mem_env[backs[i]].mem_node, env->scan_env); + tmax = tree_max_len(mem_env[backs[i]].mem_node, env->scan_env); if (min > tmin) min = tmin; if (max < tmax) max = tmax; } @@ -5782,7 +6151,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) } if (IS_INFINITE_REPEAT(qn->upper)) { - if (env->mmd.max == 0 && + if (env->mm.max == 0 && NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) { if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML); @@ -5850,7 +6219,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) copy_opt_env(&nenv, env); r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mmd, &xo.len); + add_mml(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); if (IS_NOT_NULL(en->te.Then)) { r = optimize_nodes(en->te.Then, &xo, &nenv); @@ -5899,15 +6268,6 @@ set_optimize_exact(regex_t* reg, OptStr* e) if (e->case_fold) { reg->optimize = OPTIMIZE_STR_CASE_FOLD; - if (e->good_case_fold != 0) { - if (e->len >= 2) { - r = set_sunday_quick_search_or_bmh_skip_table(reg, 1, - reg->exact, reg->exact_end, - reg->map, &(reg->map_offset)); - if (r != 0) return r; - reg->optimize = OPTIMIZE_STR_CASE_FOLD_FAST; - } - } } else { int allow_reverse; @@ -5930,11 +6290,17 @@ set_optimize_exact(regex_t* reg, OptStr* e) } } - reg->dmin = 
e->mmd.min; - reg->dmax = e->mmd.max; + reg->dist_min = e->mm.min; + reg->dist_max = e->mm.max; + + if (reg->dist_min != INFINITE_LEN) { + int n; + if (e->case_fold != 0) + n = 1; + else + n = (int )(reg->exact_end - reg->exact); - if (reg->dmin != INFINITE_LEN) { - reg->threshold_len = reg->dmin + (int )(reg->exact_end - reg->exact); + reg->threshold_len = reg->dist_min + n; } return 0; @@ -5949,11 +6315,11 @@ set_optimize_map(regex_t* reg, OptMap* m) reg->map[i] = m->map[i]; reg->optimize = OPTIMIZE_MAP; - reg->dmin = m->mmd.min; - reg->dmax = m->mmd.max; + reg->dist_min = m->mm.min; + reg->dist_max = m->mm.max; - if (reg->dmin != INFINITE_LEN) { - reg->threshold_len = reg->dmin + 1; + if (reg->dist_min != INFINITE_LEN) { + reg->threshold_len = reg->dist_min + 1; } } @@ -5979,7 +6345,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) env.options = reg->options; env.case_fold_flag = reg->case_fold_flag; env.scan_env = scan_env; - clear_mml(&env.mmd); + clear_mml(&env.mm); r = optimize_nodes(node, &opt, &env); if (r != 0) return r; @@ -5995,8 +6361,8 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) ANCR_PREC_READ_NOT); if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) { - reg->anchor_dmin = opt.len.min; - reg->anchor_dmax = opt.len.max; + reg->anc_dist_min = opt.len.min; + reg->anc_dist_max = opt.len.max; } if (opt.sb.len > 0 || opt.sm.len > 0) { @@ -6031,8 +6397,8 @@ clear_optimize_info(regex_t* reg) { reg->optimize = OPTIMIZE_NONE; reg->anchor = 0; - reg->anchor_dmin = 0; - reg->anchor_dmax = 0; + reg->anc_dist_min = 0; + reg->anc_dist_max = 0; reg->sub_anchor = 0; reg->exact_end = (UChar* )NULL; reg->map_offset = 0; @@ -6151,12 +6517,12 @@ print_optimize_info(FILE* f, regex_t* reg) { static const char* on[] = { "NONE", "STR", "STR_FAST", "STR_FAST_STEP_FORWARD", - "STR_CASE_FOLD_FAST", "STR_CASE_FOLD", "MAP" }; + "STR_CASE_FOLD", "MAP" }; fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " 
anchor: "); print_anchor(f, reg->anchor); if ((reg->anchor & ANCR_END_BUF_MASK) != 0) - print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax); + print_distance_range(f, reg->anc_dist_min, reg->anc_dist_max); fprintf(f, "\n"); if (reg->optimize) { @@ -6304,7 +6670,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, Node* root; ScanEnv scan_env; #ifdef USE_CALL - UnsetAddrList uslist; + UnsetAddrList uslist = {0}; #endif root = 0; @@ -6328,13 +6694,17 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->string_pool_end = 0; reg->num_mem = 0; reg->num_repeat = 0; - reg->num_null_check = 0; + reg->num_empty_check = 0; reg->repeat_range_alloc = 0; - reg->repeat_range = (OnigRepeatRange* )NULL; + reg->repeat_range = (RepeatRange* )NULL; + reg->empty_status_mem = 0; r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; + r = reduce_string_list(root); + if (r != 0) goto err; + /* mixed use named group and no-named group */ if (scan_env.num_named > 0 && IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && @@ -6355,38 +6725,65 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, r = unset_addr_list_init(&uslist, scan_env.num_call); if (r != 0) goto err; scan_env.unset_addr_list = &uslist; - r = setup_call(root, &scan_env, 0); + r = tune_call(root, &scan_env, 0); if (r != 0) goto err_unset; - r = setup_call2(root); + r = tune_call2(root); if (r != 0) goto err_unset; r = recursive_call_check_trav(root, &scan_env, 0); if (r < 0) goto err_unset; r = infinite_recursive_call_check_trav(root, &scan_env); if (r != 0) goto err_unset; - setup_called_state(root, 0); + tune_called_state(root, 0); } reg->num_call = scan_env.num_call; #endif - r = setup_tree(root, reg, 0, &scan_env); +#ifdef ONIG_DEBUG_PARSE + fprintf(stderr, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); + fprintf(stderr, "TREE (parsed)\n"); + print_tree(stderr, root); + 
fprintf(stderr, "\n"); +#endif + + r = tune_tree(root, reg, 0, &scan_env); if (r != 0) goto err_unset; + if (scan_env.backref_num != 0) { + set_parent_node_trav(root, NULL_NODE); + r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env); + if (r != 0) goto err_unset; + set_empty_status_check_trav(root, &scan_env); + } + #ifdef ONIG_DEBUG_PARSE + fprintf(stderr, "TREE (after tune)\n"); print_tree(stderr, root); + fprintf(stderr, "\n"); #endif - reg->capture_history = scan_env.capture_history; - reg->bt_mem_start = scan_env.bt_mem_start; - reg->bt_mem_start |= reg->capture_history; - if (IS_FIND_CONDITION(reg->options)) - MEM_STATUS_ON_ALL(reg->bt_mem_end); + reg->capture_history = scan_env.cap_history; + reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history; + +#ifdef USE_CALLOUT + if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) { + reg->push_mem_end = reg->push_mem_start; + } else { - reg->bt_mem_end = scan_env.bt_mem_end; - reg->bt_mem_end |= reg->capture_history; + if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start)) + reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history; + else + reg->push_mem_end = reg->push_mem_start & + (scan_env.backrefed_mem | scan_env.cap_history); } - reg->bt_mem_start |= reg->bt_mem_end; +#else + if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start)) + reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history; + else + reg->push_mem_end = reg->push_mem_start & + (scan_env.backrefed_mem | scan_env.cap_history); +#endif clear_optimize_info(reg); #ifndef ONIG_DONT_OPTIMIZE @@ -6420,14 +6817,20 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #endif - if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0) + set_addr_in_repeat_range(reg); + + if ((reg->push_mem_end != 0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + || (reg->num_repeat != 0) + || (reg->num_empty_check != 0) +#endif #ifdef USE_CALLOUT || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) #endif ) 
reg->stack_pop_level = STACK_POP_LEVEL_ALL; else { - if (reg->bt_mem_start != 0) + if (reg->push_mem_start != 0) reg->stack_pop_level = STACK_POP_LEVEL_MEM_START; else reg->stack_pop_level = STACK_POP_LEVEL_FREE; @@ -6560,11 +6963,14 @@ onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, if (IS_NULL(*reg)) return ONIGERR_MEMORY; r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); - if (r != 0) goto err; + if (r != 0) { + xfree(*reg); + *reg = NULL; + return r; + } r = onig_compile(*reg, pattern, pattern_end, einfo); if (r != 0) { - err: onig_free(*reg); *reg = NULL; } @@ -6709,12 +7115,14 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) #ifdef ONIG_DEBUG_PARSE +#ifdef USE_CALL static void p_string(FILE* f, int len, UChar* s) { fputs(":", f); while (len-- > 0) { fputc(*s++, f); } } +#endif static void Indent(FILE* f, int indent) @@ -6734,7 +7142,7 @@ print_indent_tree(FILE* f, Node* node, int indent) Indent(f, indent); if (IS_NULL(node)) { fprintf(f, "ERROR: null node!!!\n"); - exit (0); + exit(0); } type = NODE_TYPE(node); @@ -6758,28 +7166,22 @@ print_indent_tree(FILE* f, Node* node, int indent) case NODE_STRING: { + char* str; char* mode; - char* dont; - char* good; - if (NODE_STRING_IS_RAW(node)) - mode = "-raw"; - else if (NODE_STRING_IS_AMBIG(node)) - mode = "-ambig"; + if (NODE_STRING_IS_CRUDE(node)) + mode = "-crude"; + else if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) + mode = "-case_fold_match"; else mode = ""; - if (NODE_STRING_IS_GOOD_AMBIG(node)) - good = "-good"; - else - good = ""; - - if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) - dont = " (dont-opt)"; + if (STR_(node)->s == STR_(node)->end) + str = "empty-string"; else - dont = ""; + str = "string"; - fprintf(f, "", mode, good, dont, node); + fprintf(f, "<%s%s:%p>", str, mode, node); for (p = STR_(node)->s; p < STR_(node)->end; p++) { if (*p >= 0x20 && *p < 0x7f) fputc(*p, f); @@ -6901,6 +7303,34 @@ print_indent_tree(FILE* f, Node* 
node, int indent) case NODE_BAG: fprintf(f, " ", node); + if (BAG_(node)->type == BAG_IF_ELSE) { + Node* Then; + Node* Else; + BagNode* bn; + + bn = BAG_(node); + fprintf(f, "if-else\n"); + print_indent_tree(f, NODE_BODY(node), indent + add); + + Then = bn->te.Then; + Else = bn->te.Else; + if (IS_NULL(Then)) { + Indent(f, indent + add); + fprintf(f, "THEN empty\n"); + } + else + print_indent_tree(f, Then, indent + add); + + if (IS_NULL(Else)) { + Indent(f, indent + add); + fprintf(f, "ELSE empty\n"); + } + else + print_indent_tree(f, Else, indent + add); + + break; + } + switch (BAG_(node)->type) { case BAG_OPTION: fprintf(f, "option:%d", BAG_(node)->o.options); @@ -6911,8 +7341,7 @@ print_indent_tree(FILE* f, Node* node, int indent) case BAG_STOP_BACKTRACK: fprintf(f, "stop-bt"); break; - case BAG_IF_ELSE: - fprintf(f, "if-else"); + default: break; } fprintf(f, "\n"); diff --git a/src/regenc.c b/src/regenc.c index 9fab721..16ac313 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -2,7 +2,7 @@ regenc.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -182,7 +182,8 @@ onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc, p += enclen(enc, p); } else { - if (prev) *prev = (const UChar* )NULL; /* Sorry */ + if (prev) + *prev = onigenc_get_prev_char_head(enc, start, p); } return p; } @@ -208,20 +209,6 @@ onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n) return (UChar* )s; } -#if 0 -extern int -onigenc_mbc_enc_len_end(OnigEncoding enc, const UChar* p, const UChar* end) -{ - int len; - int n; - - len = ONIGENC_MBC_ENC_LEN(enc, p); - n = (int )(end - p); - - return (n < len ? 
n : len); -} -#endif - extern UChar* onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n) { @@ -705,18 +692,6 @@ onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p, return 1; /* return byte length of converted char to lower */ } -#if 0 -extern int -onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp)++; - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); -} -#endif - extern int onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED) { @@ -833,41 +808,6 @@ onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, } } -#if 0 -extern int -onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - if (ONIGENC_IS_MBC_ASCII(p)) { - (*pp)++; - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); - } - - (*pp) += enclen(enc, p); - return FALSE; -} -#endif - -extern int -onigenc_mb2_code_to_mbclen(OnigCodePoint code) -{ - if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; - - if ((code & 0xff00) != 0) return 2; - else return 1; -} - -extern int -onigenc_mb4_code_to_mbclen(OnigCodePoint code) -{ - if ((code & 0xff000000) != 0) return 4; - else if ((code & 0xff0000) != 0) return 3; - else if ((code & 0xff00) != 0) return 2; - else return 1; -} - extern int onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) { diff --git a/src/regenc.h b/src/regenc.h index bd2819e..db35841 100644 --- a/src/regenc.h +++ b/src/regenc.h @@ -4,7 +4,7 @@ regenc.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -163,13 +163,11 @@ extern int onigenc_length_check_is_valid_mbc_string P_((OnigEncoding enc, const /* methods for multi byte encoding */ extern OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end)); extern int onigenc_mbn_mbc_case_fold P_((OnigEncoding enc, OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower)); -extern int onigenc_mb2_code_to_mbclen P_((OnigCodePoint code)); extern int onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); extern int onigenc_minimum_property_name_to_ctype P_((OnigEncoding enc, UChar* p, UChar* end)); extern int onigenc_unicode_property_name_to_ctype P_((OnigEncoding enc, UChar* p, UChar* end)); extern int onigenc_is_mbc_word_ascii P_((OnigEncoding enc, UChar* s, const UChar* end)); extern int onigenc_mb2_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); -extern int onigenc_mb4_code_to_mbclen P_((OnigCodePoint code)); extern int onigenc_mb4_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); extern int onigenc_mb4_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); extern struct PropertyNameCtype* onigenc_euc_jp_lookup_property_name P_((register const char *str, register size_t len)); diff --git a/src/regerror.c b/src/regerror.c index e6d1806..b57a276 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -2,7 +2,7 @@ regerror.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/regexec.c b/src/regexec.c index f957b75..ce498c6 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -2,7 +2,7 @@ regexec.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -39,6 +39,20 @@ #define CHECK_INTERRUPT_IN_MATCH +#define STACK_MEM_START(reg, i) \ + (MEM_STATUS_AT((reg)->push_mem_start, (i)) != 0 ? \ + STACK_AT(mem_start_stk[i])->u.mem.pstr : (UChar* )((void* )(mem_start_stk[i]))) + +#define STACK_MEM_END(reg, i) \ + (MEM_STATUS_AT((reg)->push_mem_end, (i)) != 0 ? \ + STACK_AT(mem_end_stk[i])->u.mem.pstr : (UChar* )((void* )(mem_end_stk[i]))) + +static int forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, UChar* range, UChar** low, UChar** high, UChar** low_prev); + +static int +search_in_range(regex_t* reg, const UChar* str, const UChar* end, const UChar* start, const UChar* range, /* match range */ const UChar* data_range, /* subject string range */ OnigRegion* region, OnigOptionType option, OnigMatchParam* mp); + + #ifdef USE_CALLOUT typedef struct { int last_match_at_call_counter; @@ -129,7 +143,7 @@ typedef struct { } MatchArg; -#ifdef ONIG_DEBUG +#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) /* arguments type */ typedef enum { @@ -149,102 +163,108 @@ typedef struct { } OpInfoType; static OpInfoType OpInfo[] = { - { OP_FINISH, "finish" }, - { OP_END, "end" }, - { OP_EXACT1, "exact1" }, - { OP_EXACT2, "exact2" }, - { OP_EXACT3, "exact3" }, - { OP_EXACT4, "exact4" }, - { OP_EXACT5, "exact5" }, - { OP_EXACTN, "exactn" }, - { OP_EXACTMB2N1, "exactmb2-n1" }, - { OP_EXACTMB2N2, "exactmb2-n2" }, - { OP_EXACTMB2N3, "exactmb2-n3" }, - { OP_EXACTMB2N, "exactmb2-n" }, - { OP_EXACTMB3N, "exactmb3n" }, 
- { OP_EXACTMBN, "exactmbn" }, - { OP_EXACT1_IC, "exact1-ic" }, - { OP_EXACTN_IC, "exactn-ic" }, - { OP_CCLASS, "cclass" }, - { OP_CCLASS_MB, "cclass-mb" }, - { OP_CCLASS_MIX, "cclass-mix" }, - { OP_CCLASS_NOT, "cclass-not" }, - { OP_CCLASS_MB_NOT, "cclass-mb-not" }, - { OP_CCLASS_MIX_NOT, "cclass-mix-not" }, - { OP_ANYCHAR, "anychar" }, - { OP_ANYCHAR_ML, "anychar-ml" }, - { OP_ANYCHAR_STAR, "anychar*" }, - { OP_ANYCHAR_ML_STAR, "anychar-ml*" }, - { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next" }, - { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next" }, - { OP_WORD, "word" }, - { OP_WORD_ASCII, "word-ascii" }, - { OP_NO_WORD, "not-word" }, - { OP_NO_WORD_ASCII, "not-word-ascii" }, - { OP_WORD_BOUNDARY, "word-boundary" }, - { OP_NO_WORD_BOUNDARY, "not-word-boundary" }, - { OP_WORD_BEGIN, "word-begin" }, - { OP_WORD_END, "word-end" }, - { OP_TEXT_SEGMENT_BOUNDARY, "text-segment-boundary" }, - { OP_BEGIN_BUF, "begin-buf" }, - { OP_END_BUF, "end-buf" }, - { OP_BEGIN_LINE, "begin-line" }, - { OP_END_LINE, "end-line" }, - { OP_SEMI_END_BUF, "semi-end-buf" }, - { OP_BEGIN_POSITION, "begin-position" }, - { OP_BACKREF1, "backref1" }, - { OP_BACKREF2, "backref2" }, - { OP_BACKREF_N, "backref-n" }, - { OP_BACKREF_N_IC, "backref-n-ic" }, - { OP_BACKREF_MULTI, "backref_multi" }, - { OP_BACKREF_MULTI_IC, "backref_multi-ic" }, - { OP_BACKREF_WITH_LEVEL, "backref_with_level" }, - { OP_BACKREF_WITH_LEVEL_IC, "backref_with_level-c" }, - { OP_BACKREF_CHECK, "backref_check" }, - { OP_BACKREF_CHECK_WITH_LEVEL, "backref_check_with_level" }, - { OP_MEMORY_START_PUSH, "mem-start-push" }, - { OP_MEMORY_START, "mem-start" }, - { OP_MEMORY_END_PUSH, "mem-end-push" }, - { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec" }, - { OP_MEMORY_END, "mem-end" }, - { OP_MEMORY_END_REC, "mem-end-rec" }, - { OP_FAIL, "fail" }, - { OP_JUMP, "jump" }, - { OP_PUSH, "push" }, - { OP_PUSH_SUPER, "push-super" }, - { OP_POP_OUT, "pop-out" }, + { OP_FINISH, "finish"}, + { OP_END, "end"}, + { OP_STR_1, 
"str_1"}, + { OP_STR_2, "str_2"}, + { OP_STR_3, "str_3"}, + { OP_STR_4, "str_4"}, + { OP_STR_5, "str_5"}, + { OP_STR_N, "str_n"}, + { OP_STR_MB2N1, "str_mb2-n1"}, + { OP_STR_MB2N2, "str_mb2-n2"}, + { OP_STR_MB2N3, "str_mb2-n3"}, + { OP_STR_MB2N, "str_mb2-n"}, + { OP_STR_MB3N, "str_mb3n"}, + { OP_STR_MBN, "str_mbn"}, + { OP_STR_1_IC, "str_1-ic"}, + { OP_STR_N_IC, "str_n-ic"}, + { OP_CCLASS, "cclass"}, + { OP_CCLASS_MB, "cclass-mb"}, + { OP_CCLASS_MIX, "cclass-mix"}, + { OP_CCLASS_NOT, "cclass-not"}, + { OP_CCLASS_MB_NOT, "cclass-mb-not"}, + { OP_CCLASS_MIX_NOT, "cclass-mix-not"}, + { OP_ANYCHAR, "anychar"}, + { OP_ANYCHAR_ML, "anychar-ml"}, + { OP_ANYCHAR_STAR, "anychar*"}, + { OP_ANYCHAR_ML_STAR, "anychar-ml*"}, + { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next"}, + { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next"}, + { OP_WORD, "word"}, + { OP_WORD_ASCII, "word-ascii"}, + { OP_NO_WORD, "not-word"}, + { OP_NO_WORD_ASCII, "not-word-ascii"}, + { OP_WORD_BOUNDARY, "word-boundary"}, + { OP_NO_WORD_BOUNDARY, "not-word-boundary"}, + { OP_WORD_BEGIN, "word-begin"}, + { OP_WORD_END, "word-end"}, + { OP_TEXT_SEGMENT_BOUNDARY, "text-segment-boundary"}, + { OP_BEGIN_BUF, "begin-buf"}, + { OP_END_BUF, "end-buf"}, + { OP_BEGIN_LINE, "begin-line"}, + { OP_END_LINE, "end-line"}, + { OP_SEMI_END_BUF, "semi-end-buf"}, + { OP_BEGIN_POSITION, "begin-position"}, + { OP_BACKREF1, "backref1"}, + { OP_BACKREF2, "backref2"}, + { OP_BACKREF_N, "backref-n"}, + { OP_BACKREF_N_IC, "backref-n-ic"}, + { OP_BACKREF_MULTI, "backref_multi"}, + { OP_BACKREF_MULTI_IC, "backref_multi-ic"}, + { OP_BACKREF_WITH_LEVEL, "backref_with_level"}, + { OP_BACKREF_WITH_LEVEL_IC, "backref_with_level-c"}, + { OP_BACKREF_CHECK, "backref_check"}, + { OP_BACKREF_CHECK_WITH_LEVEL, "backref_check_with_level"}, + { OP_MEM_START_PUSH, "mem-start-push"}, + { OP_MEM_START, "mem-start"}, + { OP_MEM_END_PUSH, "mem-end-push"}, +#ifdef USE_CALL + { OP_MEM_END_PUSH_REC, "mem-end-push-rec"}, +#endif + { OP_MEM_END, 
"mem-end"}, +#ifdef USE_CALL + { OP_MEM_END_REC, "mem-end-rec"}, +#endif + { OP_FAIL, "fail"}, + { OP_JUMP, "jump"}, + { OP_PUSH, "push"}, + { OP_PUSH_SUPER, "push-super"}, + { OP_POP_OUT, "pop-out"}, #ifdef USE_OP_PUSH_OR_JUMP_EXACT - { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1" }, + { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1"}, +#endif + { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next"}, + { OP_REPEAT, "repeat"}, + { OP_REPEAT_NG, "repeat-ng"}, + { OP_REPEAT_INC, "repeat-inc"}, + { OP_REPEAT_INC_NG, "repeat-inc-ng"}, + { OP_EMPTY_CHECK_START, "empty-check-start"}, + { OP_EMPTY_CHECK_END, "empty-check-end"}, + { OP_EMPTY_CHECK_END_MEMST, "empty-check-end-memst"}, +#ifdef USE_CALL + { OP_EMPTY_CHECK_END_MEMST_PUSH,"empty-check-end-memst-push"}, +#endif + { OP_PREC_READ_START, "push-pos"}, + { OP_PREC_READ_END, "pop-pos"}, + { OP_PREC_READ_NOT_START, "prec-read-not-start"}, + { OP_PREC_READ_NOT_END, "prec-read-not-end"}, + { OP_ATOMIC_START, "atomic-start"}, + { OP_ATOMIC_END, "atomic-end"}, + { OP_LOOK_BEHIND, "look-behind"}, + { OP_LOOK_BEHIND_NOT_START, "look-behind-not-start"}, + { OP_LOOK_BEHIND_NOT_END, "look-behind-not-end"}, + { OP_PUSH_SAVE_VAL, "push-save-val"}, + { OP_UPDATE_VAR, "update-var"}, +#ifdef USE_CALL + { OP_CALL, "call"}, + { OP_RETURN, "return"}, #endif - { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next" }, - { OP_REPEAT, "repeat" }, - { OP_REPEAT_NG, "repeat-ng" }, - { OP_REPEAT_INC, "repeat-inc" }, - { OP_REPEAT_INC_NG, "repeat-inc-ng" }, - { OP_REPEAT_INC_SG, "repeat-inc-sg" }, - { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg" }, - { OP_EMPTY_CHECK_START, "empty-check-start" }, - { OP_EMPTY_CHECK_END, "empty-check-end" }, - { OP_EMPTY_CHECK_END_MEMST, "empty-check-end-memst" }, - { OP_EMPTY_CHECK_END_MEMST_PUSH,"empty-check-end-memst-push" }, - { OP_PREC_READ_START, "push-pos" }, - { OP_PREC_READ_END, "pop-pos" }, - { OP_PREC_READ_NOT_START, "prec-read-not-start" }, - { OP_PREC_READ_NOT_END, "prec-read-not-end" }, - { OP_ATOMIC_START, "atomic-start" }, - { 
OP_ATOMIC_END, "atomic-end" }, - { OP_LOOK_BEHIND, "look-behind" }, - { OP_LOOK_BEHIND_NOT_START, "look-behind-not-start" }, - { OP_LOOK_BEHIND_NOT_END, "look-behind-not-end" }, - { OP_CALL, "call" }, - { OP_RETURN, "return" }, - { OP_PUSH_SAVE_VAL, "push-save-val" }, - { OP_UPDATE_VAR, "update-var" }, #ifdef USE_CALLOUT - { OP_CALLOUT_CONTENTS, "callout-contents" }, - { OP_CALLOUT_NAME, "callout-name" }, + { OP_CALLOUT_CONTENTS, "callout-contents"}, + { OP_CALLOUT_NAME, "callout-name"}, #endif - { -1, "" } + { -1, ""} }; static char* @@ -320,32 +340,32 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, fprintf(f, "%s", op2name(opcode)); switch (opcode) { - case OP_EXACT1: + case OP_STR_1: p_string(f, 1, p->exact.s); break; - case OP_EXACT2: + case OP_STR_2: p_string(f, 2, p->exact.s); break; - case OP_EXACT3: + case OP_STR_3: p_string(f, 3, p->exact.s); break; - case OP_EXACT4: + case OP_STR_4: p_string(f, 4, p->exact.s); break; - case OP_EXACT5: + case OP_STR_5: p_string(f, 5, p->exact.s); break; - case OP_EXACTN: + case OP_STR_N: len = p->exact_n.n; p_string(f, len, p->exact_n.s); break; - case OP_EXACTMB2N1: + case OP_STR_MB2N1: p_string(f, 2, p->exact.s); break; - case OP_EXACTMB2N2: + case OP_STR_MB2N2: p_string(f, 4, p->exact.s); break; - case OP_EXACTMB2N3: + case OP_STR_MB2N3: p_string(f, 3, p->exact.s); break; - case OP_EXACTMB2N: + case OP_STR_MB2N: len = p->exact_n.n; p_len_string(f, len, 2, p->exact_n.s); break; - case OP_EXACTMB3N: + case OP_STR_MB3N: len = p->exact_n.n; p_len_string(f, len, 3, p->exact_n.s); break; - case OP_EXACTMBN: + case OP_STR_MBN: { int mb_len; @@ -357,11 +377,11 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, while (n-- > 0) { fputc(*q++, f); } } break; - case OP_EXACT1_IC: + case OP_STR_1_IC: len = enclen(enc, p->exact.s); p_string(f, len, p->exact.s); break; - case OP_EXACTN_IC: + case OP_STR_N_IC: len = p->exact_n.n; p_len_string(f, len, 1, p->exact_n.s); break; @@ -375,13 +395,13 @@ 
print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_CCLASS_MB_NOT: { OnigCodePoint ncode; - OnigCodePoint* codes; + OnigCodePoint* codes; codes = (OnigCodePoint* )p->cclass_mb.mb; GET_CODE_POINT(ncode, codes); codes++; GET_CODE_POINT(code, codes); - fprintf(f, ":%u:%u", code, ncode); + fprintf(f, ":%d:0x%x", ncode, code); } break; case OP_CCLASS_MIX: @@ -447,15 +467,18 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, } break; - case OP_MEMORY_START: - case OP_MEMORY_START_PUSH: + case OP_MEM_START: + case OP_MEM_START_PUSH: mem = p->memory_start.num; fprintf(f, ":%d", mem); break; - case OP_MEMORY_END_PUSH: - case OP_MEMORY_END_PUSH_REC: - case OP_MEMORY_END: - case OP_MEMORY_END_REC: + + case OP_MEM_END: + case OP_MEM_END_PUSH: +#ifdef USE_CALL + case OP_MEM_END_REC: + case OP_MEM_END_PUSH_REC: +#endif mem = p->memory_end.num; fprintf(f, ":%d", mem); break; @@ -499,8 +522,6 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_REPEAT_INC: case OP_REPEAT_INC_NG: - case OP_REPEAT_INC_SG: - case OP_REPEAT_INC_NG_SG: mem = p->repeat.id; fprintf(f, ":%d", mem); break; @@ -511,7 +532,9 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, break; case OP_EMPTY_CHECK_END: case OP_EMPTY_CHECK_END_MEMST: +#ifdef USE_CALL case OP_EMPTY_CHECK_END_MEMST_PUSH: +#endif mem = p->empty_check_end.mem; fprintf(f, ":%d", mem); break; @@ -534,10 +557,12 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, p_rel_addr(f, addr, p, start); break; +#ifdef USE_CALL case OP_CALL: addr = p->call.addr; fprintf(f, ":{/%d}", addr); break; +#endif case OP_PUSH_SAVE_VAL: { @@ -607,7 +632,9 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_ATOMIC_START: case OP_ATOMIC_END: case OP_LOOK_BEHIND_NOT_END: +#ifdef USE_CALL case OP_RETURN: +#endif break; default: @@ -615,7 +642,7 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, break; } } -#endif /* ONIG_DEBUG */ +#endif /* defined(ONIG_DEBUG_COMPILE) || 
defined(ONIG_DEBUG_MATCH) */ #ifdef ONIG_DEBUG_COMPILE extern void @@ -625,8 +652,8 @@ onig_print_compiled_byte_code_list(FILE* f, regex_t* reg) Operation* start = reg->ops; Operation* end = reg->ops + reg->ops_used; - fprintf(f, "bt_mem_start: 0x%x, bt_mem_end: 0x%x\n", - reg->bt_mem_start, reg->bt_mem_end); + fprintf(f, "push_mem_start: 0x%x, push_mem_end: 0x%x\n", + reg->push_mem_start, reg->push_mem_end); fprintf(f, "code-length: %d\n", reg->ops_used); bp = start; @@ -943,7 +970,7 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) result = ONIGERR_INVALID_ARGUMENT;\ }\ best_len = result;\ - goto finish;\ + goto match_at_end;\ break;\ }\ } while(0) @@ -965,18 +992,26 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) /* handled by normal-POP */ #define STK_MEM_START 0x0010 #define STK_MEM_END 0x8030 -#define STK_REPEAT_INC 0x0050 +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR +#define STK_REPEAT_INC (0x0040 | STK_MASK_POP_HANDLED) +#else +#define STK_REPEAT_INC 0x0040 +#endif #ifdef USE_CALLOUT #define STK_CALLOUT 0x0070 #endif /* avoided by normal-POP */ #define STK_VOID 0x0000 /* for fill a blank */ +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR +#define STK_EMPTY_CHECK_START (0x3000 | STK_MASK_POP_HANDLED) +#else #define STK_EMPTY_CHECK_START 0x3000 +#endif #define STK_EMPTY_CHECK_END 0x5000 /* for recursive call */ #define STK_MEM_END_MARK 0x8100 #define STK_TO_VOID_START 0x1200 /* mark for "(?>...)" */ -#define STK_REPEAT 0x0300 +/* #define STK_REPEAT 0x0300 */ #define STK_CALL_FRAME 0x0400 #define STK_RETURN 0x0500 #define STK_SAVE_VAL 0x0600 @@ -1002,11 +1037,10 @@ typedef struct _StackType { UChar* pstr_prev; /* previous char position of pstr */ } state; struct { - int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */ - Operation* pcode; /* byte code position (head of repeated target) */ - } repeat; - struct { - StackIndex si; /* index of stack */ + int count; +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex prev_index; /* index of stack */ 
+#endif } repeat_inc; struct { UChar *pstr; /* start/end position */ @@ -1015,7 +1049,10 @@ typedef struct _StackType { StackIndex prev_end; /* prev. info (for backtrack "(...)*" ) */ } mem; struct { - UChar *pstr; /* start position */ + UChar *pstr; /* start position */ +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex prev_index; /* index of stack */ +#endif } empty_check; #ifdef USE_CALL struct { @@ -1061,29 +1098,64 @@ struct OnigCalloutArgsStruct { #endif +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define PTR_NUM_SIZE(reg) ((reg)->num_repeat + (reg)->num_empty_check + ((reg)->num_mem + 1) * 2) +#define UPDATE_FOR_STACK_REALLOC do{\ + repeat_stk = (StackIndex* )alloc_base;\ + empty_check_stk = (StackIndex* )(repeat_stk + reg->num_repeat);\ + mem_start_stk = (StackIndex* )(empty_check_stk + reg->num_empty_check);\ + mem_end_stk = mem_start_stk + num_mem + 1;\ +} while(0) + +#define SAVE_REPEAT_STK_VAR(sid) stk->u.repeat_inc.prev_index = repeat_stk[sid] +#define LOAD_TO_REPEAT_STK_VAR(sid) repeat_stk[sid] = GET_STACK_INDEX(stk) +#define POP_REPEAT_INC else if (stk->type == STK_REPEAT_INC) {repeat_stk[stk->zid] = stk->u.repeat_inc.prev_index;} + +#define SAVE_EMPTY_CHECK_STK_VAR(sid) stk->u.empty_check.prev_index = empty_check_stk[sid] +#define LOAD_TO_EMPTY_CHECK_STK_VAR(sid) empty_check_stk[sid] = GET_STACK_INDEX(stk) +#define POP_EMPTY_CHECK_START else if (stk->type == STK_EMPTY_CHECK_START) {empty_check_stk[stk->zid] = stk->u.empty_check.prev_index;} + +#else + +#define PTR_NUM_SIZE(reg) (((reg)->num_mem + 1) * 2) +#define UPDATE_FOR_STACK_REALLOC do{\ + mem_start_stk = (StackIndex* )alloc_base;\ + mem_end_stk = mem_start_stk + num_mem + 1;\ +} while(0) + +#define SAVE_REPEAT_STK_VAR(sid) +#define LOAD_TO_REPEAT_STK_VAR(sid) +#define POP_REPEAT_INC + +#define SAVE_EMPTY_CHECK_STK_VAR(sid) +#define LOAD_TO_EMPTY_CHECK_STK_VAR(sid) +#define POP_EMPTY_CHECK_START + +#endif /* USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR */ #ifdef 
USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE -#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mp) do { \ +#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ - (msa).match_stack_limit = (mp)->match_stack_limit;\ - (msa).retry_limit_in_match = (mp)->retry_limit_in_match;\ - (msa).mp = mp;\ + (msa).match_stack_limit = (mpv)->match_stack_limit;\ + (msa).retry_limit_in_match = (mpv)->retry_limit_in_match;\ + (msa).mp = mpv;\ (msa).best_len = ONIG_MISMATCH;\ - (msa).ptr_num = (reg)->num_repeat + ((reg)->num_mem + 1) * 2; \ + (msa).ptr_num = PTR_NUM_SIZE(reg);\ } while(0) #else -#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mp) do { \ +#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ - (msa).match_stack_limit = (mp)->match_stack_limit;\ - (msa).retry_limit_in_match = (mp)->retry_limit_in_match;\ - (msa).mp = mp;\ - (msa).ptr_num = (reg)->num_repeat + ((reg)->num_mem + 1) * 2; \ + (msa).match_stack_limit = (mpv)->match_stack_limit;\ + (msa).retry_limit_in_match = (mpv)->retry_limit_in_match;\ + (msa).mp = mpv;\ + (msa).ptr_num = PTR_NUM_SIZE(reg);\ } while(0) #endif @@ -1138,12 +1210,6 @@ struct OnigCalloutArgsStruct { };\ } while(0) -#define UPDATE_FOR_STACK_REALLOC do{\ - repeat_stk = (StackIndex* )alloc_base;\ - mem_start_stk = (StackIndex* )(repeat_stk + reg->num_repeat);\ - mem_end_stk = mem_start_stk + num_mem + 1;\ -} while(0) - static unsigned int MatchStackLimit = DEFAULT_MATCH_STACK_LIMIT_SIZE; extern unsigned int @@ -1164,7 +1230,9 @@ onig_set_match_stack_limit_size(unsigned int size) static unsigned long RetryLimitInMatch = DEFAULT_RETRY_LIMIT_IN_MATCH; #define CHECK_RETRY_LIMIT_IN_MATCH do {\ - if (retry_in_match_counter++ > 
retry_limit_in_match) goto retry_limit_in_match_over;\ + if (retry_in_match_counter++ > retry_limit_in_match) {\ + MATCH_AT_ERROR_RETURN(ONIGERR_RETRY_LIMIT_IN_MATCH_OVER);\ + }\ } while (0) #else @@ -1554,19 +1622,23 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_PUSH_ALT_LOOK_BEHIND_NOT(pat,s,sprev) \ STACK_PUSH(STK_ALT_LOOK_BEHIND_NOT,pat,s,sprev) +#if 0 #define STACK_PUSH_REPEAT(sid, pat) do {\ STACK_ENSURE(1);\ stk->type = STK_REPEAT;\ stk->zid = (sid);\ - stk->u.repeat.pcode = (pat);\ - stk->u.repeat.count = 0;\ + stk->u.repeat.pcode = (pat);\ STACK_INC;\ } while(0) +#endif -#define STACK_PUSH_REPEAT_INC(sindex) do {\ +#define STACK_PUSH_REPEAT_INC(sid, ct) do {\ STACK_ENSURE(1);\ stk->type = STK_REPEAT_INC;\ - stk->u.repeat_inc.si = (sindex);\ + stk->zid = (sid);\ + stk->u.repeat_inc.count = (ct);\ + SAVE_REPEAT_STK_VAR(sid);\ + LOAD_TO_REPEAT_STK_VAR(sid);\ STACK_INC;\ } while(0) @@ -1639,6 +1711,8 @@ stack_double(int is_alloca, char** arg_alloc_base, stk->type = STK_EMPTY_CHECK_START;\ stk->zid = (cnum);\ stk->u.empty_check.pstr = (s);\ + SAVE_EMPTY_CHECK_STK_VAR(cnum);\ + LOAD_TO_EMPTY_CHECK_STK_VAR(cnum);\ STACK_INC;\ } while(0) @@ -1776,7 +1850,7 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_BASE_CHECK(p, at) \ if ((p) < stk_base) {\ fprintf(stderr, "at %s\n", at);\ - goto stack_error;\ + MATCH_AT_ERROR_RETURN(ONIGERR_STACK_BUG);\ } #else #define STACK_BASE_CHECK(p, at) @@ -1827,13 +1901,12 @@ stack_double(int is_alloca, char** arg_alloc_base, mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ - else if (stk->type == STK_REPEAT_INC) {\ - STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ - }\ else if (stk->type == STK_MEM_END) {\ mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ + POP_REPEAT_INC \ + POP_EMPTY_CHECK_START \ POP_CALLOUT_CASE\ }\ }\ @@ -1852,13 +1925,12 @@ stack_double(int is_alloca, char** 
arg_alloc_base, mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ - else if (stk->type == STK_REPEAT_INC) {\ - STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ - }\ else if (stk->type == STK_MEM_END) {\ mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ + POP_REPEAT_INC \ + POP_EMPTY_CHECK_START \ /* Don't call callout here because negation of total success by (?!..) (?type == STK_EMPTY_CHECK_START) {\ - if (k->zid == (sid)) {\ - (isnull) = (k->u.empty_check.pstr == (s));\ - break;\ - }\ + if (k->zid == (sid)) break;\ }\ }\ } while(0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define GET_EMPTY_CHECK_START(sid, k) do {\ + if (reg->num_call == 0) {\ + k = STACK_AT(empty_check_stk[sid]);\ + }\ + else {\ + EMPTY_CHECK_START_SEARCH(sid, k);\ + }\ +} while(0) +#else + +#define GET_EMPTY_CHECK_START(sid, k) EMPTY_CHECK_START_SEARCH(sid, k) + +#endif + + +#define STACK_EMPTY_CHECK(isnull, sid, s) do {\ + StackType* k;\ + GET_EMPTY_CHECK_START(sid, k);\ + (isnull) = (k->u.empty_check.pstr == (s));\ +} while(0) + #define STACK_MEM_START_GET_PREV_END_ADDR(k /* STK_MEM_START*/, reg, addr) do {\ if (k->u.mem.prev_end == INVALID_STACK_INDEX) {\ (addr) = 0;\ }\ else {\ - if (MEM_STATUS_AT((reg)->bt_mem_end, k->zid))\ + if (MEM_STATUS_AT((reg)->push_mem_end, k->zid))\ (addr) = STACK_AT(k->u.mem.prev_end)->u.mem.pstr;\ else\ (addr) = (UChar* )k->u.mem.prev_end;\ @@ -1937,45 +2030,30 @@ stack_double(int is_alloca, char** arg_alloc_base, } while (0) #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT -#define STACK_EMPTY_CHECK_MEM(isnull,sid,s,reg) do {\ - StackType* k = stk;\ - while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_EMPTY_CHECK_MEM"); \ - if (k->type == STK_EMPTY_CHECK_START) {\ - if (k->zid == (sid)) {\ - if (k->u.empty_check.pstr != (s)) {\ - (isnull) = 0;\ - break;\ +#define STACK_EMPTY_CHECK_MEM(isnull, sid, s, reg) do {\ + StackType* k;\ + 
GET_EMPTY_CHECK_START(sid, k);\ + if (k->u.empty_check.pstr != (s)) {\ + (isnull) = 0;\ + }\ + else {\ + UChar* endp;\ + (isnull) = 1;\ + while (k < stk) {\ + if (k->type == STK_MEM_START &&\ + MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid)) {\ + STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ + if (endp == 0) {\ + (isnull) = 0; break;\ }\ - else {\ - UChar* endp;\ - int level = 0;\ - (isnull) = 1;\ - while (k < stk) {\ - if (k->type == STK_MEM_START && level == 0) {\ - STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ - if (endp == 0) {\ - (isnull) = 0; break;\ - }\ - else if (STACK_AT(k->u.mem.prev_start)->u.mem.pstr != endp) {\ - (isnull) = 0; break;\ - }\ - else if (endp != s) {\ - (isnull) = -1; /* empty, but position changed */ \ - }\ - }\ - else if (k->type == STK_PREC_READ_START) {\ - level++;\ - }\ - else if (k->type == STK_PREC_READ_END) {\ - level--;\ - }\ - k++;\ - }\ - break;\ + else if (STACK_AT(k->u.mem.prev_start)->u.mem.pstr != endp) {\ + (isnull) = 0; break;\ + }\ + else if (endp != s) {\ + (isnull) = -1; /* empty, but position changed */ \ }\ }\ + k++;\ }\ }\ } while(0) @@ -1995,11 +2073,11 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ else {\ UChar* endp;\ - int prec_level = 0;\ (isnull) = 1;\ while (k < stk) {\ if (k->type == STK_MEM_START) {\ - if (level == 0 && prec_level == 0) {\ + if (level == 0 && \ + MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid) !=0) {\ STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ if (endp == 0) {\ (isnull) = 0; break;\ @@ -2018,12 +2096,6 @@ stack_double(int is_alloca, char** arg_alloc_base, else if (k->type == STK_EMPTY_CHECK_END) {\ if (k->zid == (sid)) level--;\ }\ - else if (k->type == STK_PREC_READ_START) {\ - prec_level++;\ - }\ - else if (k->type == STK_PREC_READ_END) {\ - prec_level--;\ - }\ k++;\ }\ break;\ @@ -2062,24 +2134,45 @@ stack_double(int is_alloca, char** arg_alloc_base, } while(0) #endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ -#define STACK_GET_REPEAT(sid, k) 
do {\ - int level = 0;\ - k = stk;\ +#define STACK_GET_REPEAT_COUNT_SEARCH(sid, c) do {\ + StackType* k = stk;\ while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_GET_REPEAT"); \ - if (k->type == STK_REPEAT) {\ - if (level == 0) {\ - if (k->zid == (sid)) {\ - break;\ + (k)--;\ + STACK_BASE_CHECK(k, "STACK_GET_REPEAT_COUNT_SEARCH");\ + if ((k)->type == STK_REPEAT_INC) {\ + if ((k)->zid == (sid)) {\ + (c) = (k)->u.repeat_inc.count;\ + break;\ + }\ + }\ + else if ((k)->type == STK_RETURN) {\ + int level = -1;\ + while (1) {\ + (k)--;\ + if ((k)->type == STK_CALL_FRAME) {\ + level++;\ + if (level == 0) break;\ }\ + else if ((k)->type == STK_RETURN) level--;\ }\ }\ - else if (k->type == STK_CALL_FRAME) level--;\ - else if (k->type == STK_RETURN) level++;\ }\ } while(0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define STACK_GET_REPEAT_COUNT(sid, c) do {\ + if (reg->num_call == 0) {\ + (c) = (STACK_AT(repeat_stk[sid]))->u.repeat_inc.count;\ + }\ + else {\ + STACK_GET_REPEAT_COUNT_SEARCH(sid, c);\ + }\ +} while(0) +#else +#define STACK_GET_REPEAT_COUNT(sid, c) STACK_GET_REPEAT_COUNT_SEARCH(sid, c) +#endif + #define STACK_RETURN(addr) do {\ int level = 0;\ StackType* k = stk;\ @@ -2481,6 +2574,8 @@ typedef struct { #define MATCH_DEBUG_OUT(offset) #endif +#define MATCH_AT_ERROR_RETURN(err_code) best_len = err_code; goto match_at_end + /* match data(str - end) from position (sstart). */ /* if sstart == str then set sprev to NULL. 
*/ @@ -2500,20 +2595,20 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, static const void *opcode_to_label[] = { &&L_FINISH, &&L_END, - &&L_EXACT1, - &&L_EXACT2, - &&L_EXACT3, - &&L_EXACT4, - &&L_EXACT5, - &&L_EXACTN, - &&L_EXACTMB2N1, - &&L_EXACTMB2N2, - &&L_EXACTMB2N3, - &&L_EXACTMB2N, - &&L_EXACTMB3N, - &&L_EXACTMBN, - &&L_EXACT1_IC, - &&L_EXACTN_IC, + &&L_STR_1, + &&L_STR_2, + &&L_STR_3, + &&L_STR_4, + &&L_STR_5, + &&L_STR_N, + &&L_STR_MB2N1, + &&L_STR_MB2N2, + &&L_STR_MB2N3, + &&L_STR_MB2N, + &&L_STR_MB3N, + &&L_STR_MBN, + &&L_STR_1_IC, + &&L_STR_N_IC, &&L_CCLASS, &&L_CCLASS_MB, &&L_CCLASS_MIX, @@ -2551,12 +2646,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_BACKREF_WITH_LEVEL_IC, &&L_BACKREF_CHECK, &&L_BACKREF_CHECK_WITH_LEVEL, - &&L_MEMORY_START, - &&L_MEMORY_START_PUSH, - &&L_MEMORY_END_PUSH, - &&L_MEMORY_END_PUSH_REC, - &&L_MEMORY_END, - &&L_MEMORY_END_REC, + &&L_MEM_START, + &&L_MEM_START_PUSH, + &&L_MEM_END_PUSH, +#ifdef USE_CALL + &&L_MEM_END_PUSH_REC, +#endif + &&L_MEM_END, +#ifdef USE_CALL + &&L_MEM_END_REC, +#endif &&L_FAIL, &&L_JUMP, &&L_PUSH, @@ -2570,12 +2669,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_REPEAT_NG, &&L_REPEAT_INC, &&L_REPEAT_INC_NG, - &&L_REPEAT_INC_SG, - &&L_REPEAT_INC_NG_SG, &&L_EMPTY_CHECK_START, &&L_EMPTY_CHECK_END, &&L_EMPTY_CHECK_END_MEMST, +#ifdef USE_CALL &&L_EMPTY_CHECK_END_MEMST_PUSH, +#endif &&L_PREC_READ_START, &&L_PREC_READ_END, &&L_PREC_READ_NOT_START, @@ -2585,10 +2684,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_LOOK_BEHIND, &&L_LOOK_BEHIND_NOT_START, &&L_LOOK_BEHIND_NOT_END, - &&L_CALL, - &&L_RETURN, &&L_PUSH_SAVE_VAL, &&L_UPDATE_VAR, +#ifdef USE_CALL + &&L_CALL, + &&L_RETURN, +#endif #ifdef USE_CALLOUT &&L_CALLOUT_CONTENTS, &&L_CALLOUT_NAME, @@ -2606,15 +2707,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, char *alloc_base; StackType *stk_base, *stk, *stk_end; StackType *stkp; /* used as any purpose. 
*/ - StackIndex si; - StackIndex *repeat_stk; StackIndex *mem_start_stk, *mem_end_stk; UChar* keep; + +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex *repeat_stk; + StackIndex *empty_check_stk; +#endif #ifdef USE_RETRY_LIMIT_IN_MATCH unsigned long retry_limit_in_match; unsigned long retry_in_match_counter; #endif - #ifdef USE_CALLOUT int of; #endif @@ -2700,15 +2803,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, rmt[0].rm_eo = (regoff_t )(s - str); for (i = 1; i <= num_mem; i++) { if (mem_end_stk[i] != INVALID_STACK_INDEX) { - if (MEM_STATUS_AT(reg->bt_mem_start, i)) - rmt[i].rm_so = (regoff_t )(STACK_AT(mem_start_stk[i])->u.mem.pstr - str); - else - rmt[i].rm_so = (regoff_t )((UChar* )((void* )(mem_start_stk[i])) - str); - - rmt[i].rm_eo = (regoff_t )((MEM_STATUS_AT(reg->bt_mem_end, i) - ? STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - - str); + rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str); + rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i) - str); } else { rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; @@ -2721,14 +2817,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, region->end[0] = (int )(s - str); for (i = 1; i <= num_mem; i++) { if (mem_end_stk[i] != INVALID_STACK_INDEX) { - if (MEM_STATUS_AT(reg->bt_mem_start, i)) - region->beg[i] = (int )(STACK_AT(mem_start_stk[i])->u.mem.pstr - str); - else - region->beg[i] = (int )((UChar* )((void* )mem_start_stk[i]) - str); - - region->end[i] = (int )((MEM_STATUS_AT(reg->bt_mem_end, i) - ? 
STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - str); + region->beg[i] = (int )(STACK_MEM_START(reg, i) - str); + region->end[i] = (int )(STACK_MEM_END(reg, i) - str); } else { region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; @@ -2756,10 +2846,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, stkp = stk_base; r = make_capture_history_tree(region->history_root, &stkp, stk, (UChar* )str, reg); - if (r < 0) { - best_len = r; /* error code */ - goto finish; - } + if (r < 0) MATCH_AT_ERROR_RETURN(r); } #endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_API_REGION_OPTION @@ -2784,9 +2871,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } /* default behavior: return first-matching result. */ - goto finish; + goto match_at_end; - CASE_OP(EXACT1) + CASE_OP(STR_1) DATA_ENSURE(1); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2794,7 +2881,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; NEXT_OUT; - CASE_OP(EXACT1_IC) + CASE_OP(STR_1_IC) { int len; UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; @@ -2815,7 +2902,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; NEXT_OUT; - CASE_OP(EXACT2) + CASE_OP(STR_2) DATA_ENSURE(2); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2826,7 +2913,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACT3) + CASE_OP(STR_3) DATA_ENSURE(3); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2839,7 +2926,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACT4) + CASE_OP(STR_4) DATA_ENSURE(4); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2854,7 +2941,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACT5) + CASE_OP(STR_5) DATA_ENSURE(5); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2871,7 +2958,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTN) + CASE_OP(STR_N) tlen = 
p->exact_n.n; DATA_ENSURE(tlen); ps = p->exact_n.s; @@ -2882,7 +2969,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTN_IC) + CASE_OP(STR_N_IC) { int len; UChar *q, *endp, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; @@ -2900,6 +2987,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, DATA_ENSURE(0); q = lowbuf; while (len-- > 0) { + if (ps >= endp) goto fail; if (*ps != *q) goto fail; ps++; q++; } @@ -2909,7 +2997,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB2N1) + CASE_OP(STR_MB2N1) DATA_ENSURE(2); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2919,7 +3007,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; NEXT_OUT; - CASE_OP(EXACTMB2N2) + CASE_OP(STR_MB2N2) DATA_ENSURE(4); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2934,7 +3022,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB2N3) + CASE_OP(STR_MB2N3) DATA_ENSURE(6); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2953,7 +3041,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB2N) + CASE_OP(STR_MB2N) tlen = p->exact_n.n; DATA_ENSURE(tlen * 2); ps = p->exact_n.s; @@ -2967,7 +3055,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB3N) + CASE_OP(STR_MB3N) tlen = p->exact_n.n; DATA_ENSURE(tlen * 3); ps = p->exact_n.s; @@ -2983,7 +3071,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMBN) + CASE_OP(STR_MBN) tlen = p->exact_len_n.len; /* mb byte len */ tlen2 = p->exact_len_n.n; /* number of chars */ tlen2 *= tlen; @@ -3014,7 +3102,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar *ss; int mb_len; - DATA_ENSURE(1); mb_len = enclen(encode, s); DATA_ENSURE(mb_len); ss = s; @@ -3303,7 +3390,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, break; #endif default: - goto 
bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNDEFINED_BYTECODE); break; } @@ -3403,46 +3490,50 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(MEMORY_START_PUSH) + CASE_OP(MEM_START_PUSH) mem = p->memory_start.num; STACK_PUSH_MEM_START(mem, s); INC_OP; JUMP_OUT; - CASE_OP(MEMORY_START) + CASE_OP(MEM_START) mem = p->memory_start.num; mem_start_stk[mem] = (StackIndex )((void* )s); INC_OP; JUMP_OUT; - CASE_OP(MEMORY_END_PUSH) + CASE_OP(MEM_END_PUSH) mem = p->memory_end.num; STACK_PUSH_MEM_END(mem, s); INC_OP; JUMP_OUT; - CASE_OP(MEMORY_END) + CASE_OP(MEM_END) mem = p->memory_end.num; mem_end_stk[mem] = (StackIndex )((void* )s); INC_OP; JUMP_OUT; #ifdef USE_CALL - CASE_OP(MEMORY_END_PUSH_REC) - mem = p->memory_end.num; - STACK_GET_MEM_START(mem, stkp); /* should be before push mem-end. */ - si = GET_STACK_INDEX(stkp); - STACK_PUSH_MEM_END(mem, s); - mem_start_stk[mem] = si; - INC_OP; - JUMP_OUT; + CASE_OP(MEM_END_PUSH_REC) + { + StackIndex si; + + mem = p->memory_end.num; + STACK_GET_MEM_START(mem, stkp); /* should be before push mem-end. */ + si = GET_STACK_INDEX(stkp); + STACK_PUSH_MEM_END(mem, s); + mem_start_stk[mem] = si; + INC_OP; + JUMP_OUT; + } - CASE_OP(MEMORY_END_REC) + CASE_OP(MEM_END_REC) mem = p->memory_end.num; mem_end_stk[mem] = (StackIndex )((void* )s); STACK_GET_MEM_START(mem, stkp); - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) + if (MEM_STATUS_AT(reg->push_mem_start, mem)) mem_start_stk[mem] = GET_STACK_INDEX(stkp); else mem_start_stk[mem] = (StackIndex )((void* )stkp->u.mem.pstr); @@ -3470,14 +3561,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); @@ -3499,14 +3584,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); @@ -3531,14 +3610,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); @@ -3569,14 +3642,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); @@ -3689,12 +3756,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH: case OP_REPEAT_INC: case OP_REPEAT_INC_NG: - case OP_REPEAT_INC_SG: - case OP_REPEAT_INC_NG_SG: INC_OP; break; default: - goto unexpected_bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNEXPECTED_BYTECODE); break; } #else @@ -3797,7 +3862,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, addr = p->push_if_peek_next.addr; c = p->push_if_peek_next.c; - if (c == *s) { + if (DATA_ENSURE_CHECK1 && c == *s) { STACK_PUSH_ALT(p + addr, s, sprev); INC_OP; JUMP_OUT; @@ -3810,10 +3875,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mem = p->repeat.id; /* mem: OP_REPEAT ID */ addr = p->repeat.addr; - STACK_ENSURE(1); - repeat_stk[mem] = GET_STACK_INDEX(stk); - STACK_PUSH_REPEAT(mem, p + 1); - + STACK_PUSH_REPEAT_INC(mem, 0); if (reg->repeat_range[mem].lower == 0) { STACK_PUSH_ALT(p + addr, s, sprev); } @@ -3824,10 +3886,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mem = p->repeat.id; /* mem: OP_REPEAT ID */ addr = p->repeat.addr; - STACK_ENSURE(1); - repeat_stk[mem] = GET_STACK_INDEX(stk); - STACK_PUSH_REPEAT(mem, p + 1); - + STACK_PUSH_REPEAT_INC(mem, 0); if (reg->repeat_range[mem].lower == 0) { STACK_PUSH_ALT(p + 1, s, sprev); p += addr; @@ -3838,64 +3897,42 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(REPEAT_INC) mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - si = repeat_stk[mem]; - stkp = STACK_AT(si); - - repeat_inc: - stkp->u.repeat.count++; - if (stkp->u.repeat.count >= reg->repeat_range[mem].upper) { + STACK_GET_REPEAT_COUNT(mem, n); + n++; + if (n >= reg->repeat_range[mem].upper) { /* end of repeat. Nothing to do. 
*/ INC_OP; } - else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + else if (n >= reg->repeat_range[mem].lower) { INC_OP; STACK_PUSH_ALT(p, s, sprev); - p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. */ + p = reg->repeat_range[mem].u.pcode; } else { - p = stkp->u.repeat.pcode; + p = reg->repeat_range[mem].u.pcode; } - STACK_PUSH_REPEAT_INC(si); + STACK_PUSH_REPEAT_INC(mem, n); CHECK_INTERRUPT_JUMP_OUT; - CASE_OP(REPEAT_INC_SG) - mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - goto repeat_inc; - CASE_OP(REPEAT_INC_NG) mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - si = repeat_stk[mem]; - stkp = STACK_AT(si); - - repeat_inc_ng: - stkp->u.repeat.count++; - if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { - if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - Operation* pcode = stkp->u.repeat.pcode; - - STACK_PUSH_REPEAT_INC(si); - STACK_PUSH_ALT(pcode, s, sprev); + STACK_GET_REPEAT_COUNT(mem, n); + n++; + STACK_PUSH_REPEAT_INC(mem, n); + if (n == reg->repeat_range[mem].upper) { + INC_OP; + } + else { + if (n >= reg->repeat_range[mem].lower) { + STACK_PUSH_ALT(reg->repeat_range[mem].u.pcode, s, sprev); INC_OP; } else { - p = stkp->u.repeat.pcode; - STACK_PUSH_REPEAT_INC(si); + p = reg->repeat_range[mem].u.pcode; } } - else if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { - STACK_PUSH_REPEAT_INC(si); - INC_OP; - } CHECK_INTERRUPT_JUMP_OUT; - CASE_OP(REPEAT_INC_NG_SG) - mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - goto repeat_inc_ng; - CASE_OP(PREC_READ_START) STACK_PUSH_PREC_READ_START(s, sprev); INC_OP; @@ -4044,14 +4081,14 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, OnigCalloutFunc func; OnigCalloutArgs args; - of = ONIG_CALLOUT_OF_NAME; - name_id = p->callout_name.id; - mem = p->callout_name.num; + of = ONIG_CALLOUT_OF_NAME; + mem = p->callout_name.num; 
callout_common_entry: e = onig_reg_callout_list_at(reg, mem); in = e->in; if (of == ONIG_CALLOUT_OF_NAME) { + name_id = p->callout_name.id; func = onig_get_callout_start_func(reg, mem); } else { @@ -4074,7 +4111,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, call_result = ONIGERR_INVALID_ARGUMENT; } best_len = call_result; - goto finish; + goto match_at_end; break; } } @@ -4100,7 +4137,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif CASE_OP(FINISH) - goto finish; + goto match_at_end; #ifdef ONIG_DEBUG_STATISTICS fail: @@ -4121,95 +4158,478 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, JUMP_OUT; DEFAULT_OP - goto bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNDEFINED_BYTECODE); } BYTECODE_INTERPRETER_END; - finish: + match_at_end: STACK_SAVE; return best_len; +} -#ifdef ONIG_DEBUG - stack_error: - STACK_SAVE; - return ONIGERR_STACK_BUG; -#endif +typedef struct { + regex_t* reg; + OnigRegion* region; +} RR; + +struct OnigRegSetStruct { + RR* rs; + int n; + int alloc; + OnigEncoding enc; + int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ + OnigLen anc_dmin; /* (SEMI_)END_BUF anchor distance */ + OnigLen anc_dmax; /* (SEMI_)END_BUF anchor distance */ + int all_low_high; + int anychar_inf; +}; - bytecode_error: - STACK_SAVE; - return ONIGERR_UNDEFINED_BYTECODE; +enum SearchRangeStatus { + SRS_DEAD = 0, + SRS_LOW_HIGH = 1, + SRS_ALL_RANGE = 2 +}; -#if defined(ONIG_DEBUG) && !defined(USE_DIRECT_THREADED_CODE) - unexpected_bytecode_error: - STACK_SAVE; - return ONIGERR_UNEXPECTED_BYTECODE; -#endif +typedef struct { + int state; /* value of enum SearchRangeStatus */ + UChar* low; + UChar* high; + UChar* low_prev; + UChar* sch_range; +} SearchRange; + +#define REGSET_MATCH_AND_RETURN_CHECK(upper_range) \ + r = match_at(reg, str, end, (upper_range), s, prev, msas + i); \ + if (r != ONIG_MISMATCH) {\ + if (r >= 0) {\ + goto match;\ + }\ + else goto finish; /* error */ \ + } -#ifdef USE_RETRY_LIMIT_IN_MATCH - 
retry_limit_in_match_over: - STACK_SAVE; - return ONIGERR_RETRY_LIMIT_IN_MATCH_OVER; -#endif -} +static inline int +regset_search_body_position_lead(OnigRegSet* set, + const UChar* str, const UChar* end, + const UChar* start, const UChar* range, /* match start range */ + const UChar* orig_range, /* data range */ + OnigOptionType option, MatchArg* msas, int* rmatch_pos) +{ + int r, n, i; + UChar *s, *prev; + UChar *low, *high, *low_prev; + UChar* sch_range; + regex_t* reg; + OnigEncoding enc; + SearchRange* sr; + n = set->n; + enc = set->enc; -static UChar* -slow_search(OnigEncoding enc, UChar* target, UChar* target_end, - const UChar* text, const UChar* text_end, UChar* text_range) -{ - UChar *t, *p, *s, *end; + s = (UChar* )start; + if (s > str) + prev = onigenc_get_prev_char_head(enc, str, s); + else + prev = (UChar* )NULL; - end = (UChar* )text_end; - end -= target_end - target - 1; - if (end > text_range) - end = text_range; + sr = (SearchRange* )xmalloc(sizeof(*sr) * n); + CHECK_NULL_RETURN_MEMERR(sr); - s = (UChar* )text; + for (i = 0; i < n; i++) { + reg = set->rs[i].reg; - while (s < end) { - if (*s == *target) { - p = s + 1; - t = target + 1; - while (t < target_end) { - if (*t != *p++) - break; - t++; + sr[i].state = SRS_DEAD; + if (reg->optimize != OPTIMIZE_NONE) { + if (reg->dist_max != INFINITE_LEN) { + if (end - range > reg->dist_max) + sch_range = (UChar* )range + reg->dist_max; + else + sch_range = (UChar* )end; + + if (forward_search(reg, str, end, s, sch_range, &low, &high, &low_prev)) { + sr[i].state = SRS_LOW_HIGH; + sr[i].low = low; + sr[i].high = high; + sr[i].low_prev = low_prev; + sr[i].sch_range = sch_range; + } + } + else { + sch_range = (UChar* )end; + if (forward_search(reg, str, end, s, sch_range, + &low, &high, (UChar** )NULL)) { + goto total_active; + } } - if (t == target_end) - return s; } - s += enclen(enc, s); + else { + total_active: + sr[i].state = SRS_ALL_RANGE; + sr[i].low = s; + sr[i].high = (UChar* )range; + sr[i].low_prev = 
prev; + } } - return (UChar* )NULL; -} +#define ACTIVATE_ALL_LOW_HIGH_SEARCH_THRESHOLD_LEN 500 -static int -str_lower_case_match(OnigEncoding enc, int case_fold_flag, - const UChar* t, const UChar* tend, - const UChar* p, const UChar* end) -{ - int lowlen; - UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + if (set->all_low_high != 0 + && range - start > ACTIVATE_ALL_LOW_HIGH_SEARCH_THRESHOLD_LEN) { + do { + int try_count = 0; + for (i = 0; i < n; i++) { + if (sr[i].state == SRS_DEAD) continue; + + if (s < sr[i].low) continue; + if (s >= sr[i].high) { + if (forward_search(set->rs[i].reg, str, end, s, sr[i].sch_range, + &low, &high, &low_prev) != 0) { + sr[i].low = low; + sr[i].high = high; + sr[i].low_prev = low_prev; + if (s < low) continue; + } + else { + sr[i].state = SRS_DEAD; + continue; + } + } - while (t < tend) { - lowlen = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, lowbuf); - q = lowbuf; - while (lowlen > 0) { - if (*t++ != *q++) return 0; - lowlen--; - } - } + reg = set->rs[i].reg; + REGSET_MATCH_AND_RETURN_CHECK(orig_range); + try_count++; + } /* for (i) */ - return 1; -} + if (s >= range) break; -static UChar* -slow_search_ic(OnigEncoding enc, int case_fold_flag, - UChar* target, UChar* target_end, - const UChar* text, const UChar* text_end, UChar* text_range) + if (try_count == 0) { + low = (UChar* )range; + for (i = 0; i < n; i++) { + if (sr[i].state == SRS_LOW_HIGH && low > sr[i].low) { + low = sr[i].low; + low_prev = sr[i].low_prev; + } + } + if (low == range) break; + + s = low; + prev = low_prev; + } + else { + prev = s; + s += enclen(enc, s); + } + } while (1); + } + else { + int prev_is_newline = 1; + do { + for (i = 0; i < n; i++) { + if (sr[i].state == SRS_DEAD) continue; + if (sr[i].state == SRS_LOW_HIGH) { + if (s < sr[i].low) continue; + if (s >= sr[i].high) { + if (forward_search(set->rs[i].reg, str, end, s, sr[i].sch_range, + &low, &high, &low_prev) != 0) { + sr[i].low = low; + sr[i].high = high; + /* sr[i].low_prev = low_prev; 
*/ + if (s < low) continue; + } + else { + sr[i].state = SRS_DEAD; + continue; + } + } + } + + reg = set->rs[i].reg; + if ((reg->anchor & ANCR_ANYCHAR_INF) == 0 || prev_is_newline != 0) { + REGSET_MATCH_AND_RETURN_CHECK(orig_range); + } + } + + if (s >= range) break; + + if (set->anychar_inf != 0) + prev_is_newline = ONIGENC_IS_MBC_NEWLINE(set->enc, s, end); + + prev = s; + s += enclen(enc, s); + } while (1); + } + + xfree(sr); + return ONIG_MISMATCH; + + finish: + xfree(sr); + return r; + + match: + xfree(sr); + *rmatch_pos = (int )(s - str); + return i; +} + +static inline int +regset_search_body_regex_lead(OnigRegSet* set, + const UChar* str, const UChar* end, + const UChar* start, const UChar* orig_range, OnigRegSetLead lead, + OnigOptionType option, OnigMatchParam* mps[], int* rmatch_pos) { - UChar *s, *end; + int r; + int i; + int n; + int match_index; + const UChar* ep; + regex_t* reg; + OnigRegion* region; + + n = set->n; + + match_index = ONIG_MISMATCH; + ep = orig_range; + for (i = 0; i < n; i++) { + reg = set->rs[i].reg; + region = set->rs[i].region; + r = search_in_range(reg, str, end, start, ep, orig_range, region, option, mps[i]); + if (r > 0) { + if (str + r < ep) { + match_index = i; + *rmatch_pos = r; + if (lead == ONIG_REGSET_PRIORITY_TO_REGEX_ORDER) + break; + + ep = str + r; + } + } + else if (r == 0) { + match_index = i; + *rmatch_pos = r; + break; + } + } + + return match_index; +} + +extern int +onig_regset_search_with_param(OnigRegSet* set, + const UChar* str, const UChar* end, + const UChar* start, const UChar* range, + OnigRegSetLead lead, OnigOptionType option, OnigMatchParam* mps[], + int* rmatch_pos) +{ + int r; + int i; + UChar *s, *prev; + regex_t* reg; + OnigEncoding enc; + OnigRegion* region; + MatchArg* msas; + const UChar *orig_start = start; + const UChar *orig_range = range; + + if (set->n == 0) + return ONIG_MISMATCH; + + if (IS_POSIX_REGION(option)) + return ONIGERR_INVALID_ARGUMENT; + + r = 0; + enc = set->enc; + msas = 
(MatchArg* )NULL; + + for (i = 0; i < set->n; i++) { + reg = set->rs[i].reg; + region = set->rs[i].region; + ADJUST_MATCH_PARAM(reg, mps[i]); + if (IS_NOT_NULL(region)) { + r = onig_region_resize_clear(region, reg->num_mem + 1); + if (r != 0) goto finish_no_msa; + } + } + + if (start > end || start < str) goto mismatch_no_msa; + if (str < end) { + /* forward search only */ + if (range <= start) + return ONIGERR_INVALID_ARGUMENT; + } + + if (ONIG_IS_OPTION_ON(option, ONIG_OPTION_CHECK_VALIDITY_OF_STRING)) { + if (! ONIGENC_IS_VALID_MBC_STRING(enc, str, end)) { + r = ONIGERR_INVALID_WIDE_CHAR_VALUE; + goto finish_no_msa; + } + } + + if (set->anchor != OPTIMIZE_NONE && str < end) { + UChar *min_semi_end, *max_semi_end; + + if ((set->anchor & ANCR_BEGIN_POSITION) != 0) { + /* search start-position only */ + begin_position: + range = start + 1; + } + else if ((set->anchor & ANCR_BEGIN_BUF) != 0) { + /* search str-position only */ + if (start != str) goto mismatch_no_msa; + range = str + 1; + } + else if ((set->anchor & ANCR_END_BUF) != 0) { + min_semi_end = max_semi_end = (UChar* )end; + + end_buf: + if ((OnigLen )(max_semi_end - str) < set->anc_dmin) + goto mismatch_no_msa; + + if ((OnigLen )(min_semi_end - start) > set->anc_dmax) { + start = min_semi_end - set->anc_dmax; + if (start < end) + start = onigenc_get_right_adjust_char_head(enc, str, start); + } + if ((OnigLen )(max_semi_end - (range - 1)) < set->anc_dmin) { + range = max_semi_end - set->anc_dmin + 1; + } + if (start > range) goto mismatch_no_msa; + } + else if ((set->anchor & ANCR_SEMI_END_BUF) != 0) { + UChar* pre_end = ONIGENC_STEP_BACK(enc, str, end, 1); + + max_semi_end = (UChar* )end; + if (ONIGENC_IS_MBC_NEWLINE(enc, pre_end, end)) { + min_semi_end = pre_end; + +#ifdef USE_CRNL_AS_LINE_TERMINATOR + pre_end = ONIGENC_STEP_BACK(enc, str, pre_end, 1); + if (IS_NOT_NULL(pre_end) && + ONIGENC_IS_MBC_CRNL(enc, pre_end, end)) { + min_semi_end = pre_end; + } +#endif + if (min_semi_end > str && start <= 
min_semi_end) { + goto end_buf; + } + } + else { + min_semi_end = (UChar* )end; + goto end_buf; + } + } + else if ((set->anchor & ANCR_ANYCHAR_INF_ML) != 0) { + goto begin_position; + } + } + else if (str == end) { /* empty string */ + start = end = str; + s = (UChar* )start; + prev = (UChar* )NULL; + + msas = (MatchArg* )xmalloc(sizeof(*msas) * set->n); + CHECK_NULL_RETURN_MEMERR(msas); + for (i = 0; i < set->n; i++) { + reg = set->rs[i].reg; + MATCH_ARG_INIT(msas[i], reg, option, set->rs[i].region, start, mps[i]); + } + for (i = 0; i < set->n; i++) { + reg = set->rs[i].reg; + if (reg->threshold_len == 0) { + REGSET_MATCH_AND_RETURN_CHECK(end); + } + } + + goto mismatch; + } + + if (lead == ONIG_REGSET_POSITION_LEAD) { + msas = (MatchArg* )xmalloc(sizeof(*msas) * set->n); + CHECK_NULL_RETURN_MEMERR(msas); + + for (i = 0; i < set->n; i++) { + MATCH_ARG_INIT(msas[i], set->rs[i].reg, option, set->rs[i].region, + orig_start, mps[i]); + } + + r = regset_search_body_position_lead(set, str, end, start, range, + orig_range, option, msas, rmatch_pos); + } + else { + r = regset_search_body_regex_lead(set, str, end, start, orig_range, + lead, option, mps, rmatch_pos); + } + if (r < 0) goto finish; + else goto match2; + + mismatch: + r = ONIG_MISMATCH; + finish: + for (i = 0; i < set->n; i++) { + if (IS_NOT_NULL(msas)) + MATCH_ARG_FREE(msas[i]); + if (IS_FIND_NOT_EMPTY(set->rs[i].reg->options) && + IS_NOT_NULL(set->rs[i].region)) { + onig_region_clear(set->rs[i].region); + } + } + if (IS_NOT_NULL(msas)) xfree(msas); + return r; + + mismatch_no_msa: + r = ONIG_MISMATCH; + finish_no_msa: + return r; + + match: + *rmatch_pos = (int )(s - str); + match2: + for (i = 0; i < set->n; i++) { + if (IS_NOT_NULL(msas)) + MATCH_ARG_FREE(msas[i]); + if (IS_FIND_NOT_EMPTY(set->rs[i].reg->options) && + IS_NOT_NULL(set->rs[i].region)) { + onig_region_clear(set->rs[i].region); + } + } + if (IS_NOT_NULL(msas)) xfree(msas); + return r; /* regex index */ +} + +extern int 
+onig_regset_search(OnigRegSet* set, const UChar* str, const UChar* end, + const UChar* start, const UChar* range, + OnigRegSetLead lead, OnigOptionType option, int* rmatch_pos) +{ + int r; + int i; + OnigMatchParam* mp; + OnigMatchParam** mps; + + mps = (OnigMatchParam** )xmalloc((sizeof(OnigMatchParam*) + sizeof(OnigMatchParam)) * set->n); + CHECK_NULL_RETURN_MEMERR(mps); + + mp = (OnigMatchParam* )(mps + set->n); + + for (i = 0; i < set->n; i++) { + onig_initialize_match_param(mp + i); + mps[i] = mp + i; + } + + r = onig_regset_search_with_param(set, str, end, start, range, lead, option, mps, + rmatch_pos); + for (i = 0; i < set->n; i++) + onig_free_match_param_content(mp + i); + + xfree(mps); + + return r; +} + +static UChar* +slow_search(OnigEncoding enc, UChar* target, UChar* target_end, + const UChar* text, const UChar* text_end, UChar* text_range) +{ + UChar *t, *p, *s, *end; end = (UChar* )text_end; end -= target_end - target - 1; @@ -4219,6 +4639,55 @@ slow_search_ic(OnigEncoding enc, int case_fold_flag, s = (UChar* )text; while (s < end) { + if (*s == *target) { + p = s + 1; + t = target + 1; + while (t < target_end) { + if (*t != *p++) + break; + t++; + } + if (t == target_end) + return s; + } + s += enclen(enc, s); + } + + return (UChar* )NULL; +} + +static int +str_lower_case_match(OnigEncoding enc, int case_fold_flag, + const UChar* t, const UChar* tend, + const UChar* p, const UChar* end) +{ + int lowlen; + UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + + while (t < tend) { + if (p >= end) return 0; + lowlen = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, lowbuf); + q = lowbuf; + while (lowlen > 0) { + if (t >= tend) return 0; + if (*t++ != *q++) return 0; + lowlen--; + } + } + + return 1; +} + +static UChar* +slow_search_ic(OnigEncoding enc, int case_fold_flag, + UChar* target, UChar* target_end, + const UChar* text, const UChar* text_end, UChar* text_range) +{ + UChar *s; + + s = (UChar* )text; + + while (s < text_range) { if 
(str_lower_case_match(enc, case_fold_flag, target, target_end, s, text_end)) return s; @@ -4371,60 +4840,6 @@ sunday_quick_search(regex_t* reg, const UChar* target, const UChar* target_end, return (UChar* )NULL; } -static UChar* -sunday_quick_search_case_fold(regex_t* reg, - const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, - const UChar* text_range) -{ - const UChar *s, *se, *end; - const UChar *tail; - int skip, tlen1; - int map_offset; - int case_fold_flag; - OnigEncoding enc; - -#ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, - "sunday_quick_search_case_fold: text: %p, text_end: %p, text_range: %p\n", text, text_end, text_range); -#endif - - enc = reg->enc; - case_fold_flag = reg->case_fold_flag; - - tail = target_end - 1; - tlen1 = (int )(tail - target); - end = text_range; - if (end + tlen1 > text_end) - end = text_end - tlen1; - - map_offset = reg->map_offset; - s = text; - - while (s < end) { - if (str_lower_case_match(enc, case_fold_flag, target, target_end, - s, text_end)) - return (UChar* )s; - - se = s + tlen1; - if (se + map_offset >= text_end) break; - skip = reg->map[*(se + map_offset)]; -#if 0 - p = s; - do { - s += enclen(enc, s); - } while ((s - p) < skip && s < end); -#else - /* This is faster than prev code for long text. 
ex: /(?i)Twain/ */ - s += skip; - if (s < end) - s = onigenc_get_right_adjust_char_head(enc, text, s); -#endif - } - - return (UChar* )NULL; -} - static UChar* map_search(OnigEncoding enc, UChar map[], const UChar* text, const UChar* text_range) @@ -4505,25 +4920,26 @@ onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, } static int -forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, - UChar* range, UChar** low, UChar** high, UChar** low_prev) +forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, + UChar* range, UChar** low, UChar** high, UChar** low_prev) { UChar *p, *pprev = (UChar* )NULL; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "forward_search_range: str: %p, end: %p, s: %p, range: %p\n", - str, end, s, range); + fprintf(stderr, "forward_search: str: %p, end: %p, start: %p, range: %p\n", + str, end, start, range); #endif - p = s; - if (reg->dmin > 0) { + p = start; + if (reg->dist_min != 0) { + if (end - p <= reg->dist_min) + return 0; /* fail */ + if (ONIGENC_IS_SINGLEBYTE(reg->enc)) { - p += reg->dmin; + p += reg->dist_min; } else { - UChar *q = p + reg->dmin; - - if (q >= end) return 0; /* fail */ + UChar *q = p + reg->dist_min; while (p < q) p += enclen(reg->enc, p); } } @@ -4538,11 +4954,6 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, reg->exact, reg->exact_end, p, end, range); break; - case OPTIMIZE_STR_CASE_FOLD_FAST: - p = sunday_quick_search_case_fold(reg, reg->exact, reg->exact_end, p, end, - range); - break; - case OPTIMIZE_STR_FAST: p = sunday_quick_search(reg, reg->exact, reg->exact_end, p, end, range); break; @@ -4558,7 +4969,7 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, } if (p && p < range) { - if (p - reg->dmin < s) { + if (p - start < reg->dist_min) { retry_gate: pprev = p; p += enclen(reg->enc, p); @@ -4571,8 +4982,7 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, 
UChar* s, switch (reg->sub_anchor) { case ANCR_BEGIN_LINE: if (!ON_STR_BEGIN(p)) { - prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), p); + prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } @@ -4593,35 +5003,34 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, #endif ) goto retry_gate; + break; } } - if (reg->dmax == 0) { + if (reg->dist_max == 0) { *low = p; if (low_prev) { - if (*low > s) - *low_prev = onigenc_get_prev_char_head(reg->enc, s, p); + if (*low > start) + *low_prev = onigenc_get_prev_char_head(reg->enc, start, p); else *low_prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); } + *high = p; } else { - if (reg->dmax != INFINITE_LEN) { - if (p - str < reg->dmax) { + if (reg->dist_max != INFINITE_LEN) { + if (p - str < reg->dist_max) { *low = (UChar* )str; if (low_prev) *low_prev = onigenc_get_prev_char_head(reg->enc, str, *low); } else { - *low = p - reg->dmax; - if (*low > s) { - *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, s, + *low = p - reg->dist_max; + if (*low > start) { + *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, start, *low, (const UChar** )low_prev); - if (low_prev && IS_NULL(*low_prev)) - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? 
pprev : s), *low); } else { if (low_prev) @@ -4630,14 +5039,18 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, } } } + /* no needs to adjust *high, *high is used as range check only */ + if (p - str < reg->dist_min) + *high = (UChar* )str; + else + *high = p - reg->dist_min; } - /* no needs to adjust *high, *high is used as range check only */ - *high = p - reg->dmin; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, - "forward_search_range success: low: %d, high: %d, dmin: %d, dmax: %d\n", - (int )(*low - str), (int )(*high - str), reg->dmin, reg->dmax); + "forward_search success: low: %d, high: %d, dmin: %u, dmax: %u\n", + (int )(*low - str), (int )(*high - str), + reg->dist_min, reg->dist_max); #endif return 1; /* success */ } @@ -4647,15 +5060,11 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, static int -backward_search_range(regex_t* reg, const UChar* str, const UChar* end, - UChar* s, const UChar* range, UChar* adjrange, - UChar** low, UChar** high) +backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s, + const UChar* range, UChar* adjrange, UChar** low, UChar** high) { UChar *p; - if (range == 0) goto fail; - - range += reg->dmin; p = s; retry: @@ -4667,7 +5076,6 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, break; case OPTIMIZE_STR_CASE_FOLD: - case OPTIMIZE_STR_CASE_FOLD_FAST: p = slow_search_backward_ic(reg->enc, reg->case_fold_flag, reg->exact, reg->exact_end, range, adjrange, end, p); @@ -4722,15 +5130,27 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, } } - /* no needs to adjust *high, *high is used as range check only */ - if (reg->dmax != INFINITE_LEN) { - *low = p - reg->dmax; - *high = p - reg->dmin; + if (reg->dist_max != INFINITE_LEN) { + if (p - str < reg->dist_max) + *low = (UChar* )str; + else + *low = p - reg->dist_max; + + if (reg->dist_min != 0) { + if (p - str < reg->dist_min) + *high = (UChar* )str; 
+ else + *high = p - reg->dist_min; + } + else { + *high = p; + } + *high = onigenc_get_right_adjust_char_head(reg->enc, adjrange, *high); } #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "backward_search_range: low: %d, high: %d\n", + fprintf(stderr, "backward_search: low: %d, high: %d\n", (int )(*low - str), (int )(*high - str)); #endif return 1; /* success */ @@ -4738,7 +5158,7 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, fail: #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "backward_search_range: fail.\n"); + fprintf(stderr, "backward_search: fail.\n"); #endif return 0; /* fail */ } @@ -4751,24 +5171,35 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, { int r; OnigMatchParam mp; + const UChar* data_range; onig_initialize_match_param(&mp); - r = onig_search_with_param(reg, str, end, start, range, region, option, &mp); + + /* The following is an expanded code of onig_search_with_param() */ + if (range > start) + data_range = range; + else + data_range = end; + + r = search_in_range(reg, str, end, start, range, data_range, region, + option, &mp); + onig_free_match_param_content(&mp); return r; } -extern int -onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, - const UChar* start, const UChar* range, OnigRegion* region, - OnigOptionType option, OnigMatchParam* mp) +static int +search_in_range(regex_t* reg, const UChar* str, const UChar* end, + const UChar* start, const UChar* range, /* match start range */ + const UChar* data_range, /* subject string range */ + OnigRegion* region, + OnigOptionType option, OnigMatchParam* mp) { int r; UChar *s, *prev; MatchArg msa; const UChar *orig_start = start; - const UChar *orig_range = range; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, @@ -4851,17 +5282,21 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, min_semi_end = max_semi_end = (UChar* )end; end_buf: - if ((OnigLen )(max_semi_end - str) < reg->anchor_dmin) + if ((OnigLen )(max_semi_end - 
str) < reg->anc_dist_min) goto mismatch_no_msa; if (range > start) { - if ((OnigLen )(min_semi_end - start) > reg->anchor_dmax) { - start = min_semi_end - reg->anchor_dmax; + if (reg->anc_dist_max != INFINITE_LEN && + min_semi_end - start > reg->anc_dist_max) { + start = min_semi_end - reg->anc_dist_max; if (start < end) start = onigenc_get_right_adjust_char_head(reg->enc, str, start); } - if ((OnigLen )(max_semi_end - (range - 1)) < reg->anchor_dmin) { - range = max_semi_end - reg->anchor_dmin + 1; + if (max_semi_end - (range - 1) < reg->anc_dist_min) { + if (max_semi_end - str + 1 < reg->anc_dist_min) + goto mismatch_no_msa; + else + range = max_semi_end - reg->anc_dist_min + 1; } if (start > range) goto mismatch_no_msa; @@ -4869,12 +5304,17 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, Backward search is used. */ } else { - if ((OnigLen )(min_semi_end - range) > reg->anchor_dmax) { - range = min_semi_end - reg->anchor_dmax; + if (reg->anc_dist_max != INFINITE_LEN && + min_semi_end - range > reg->anc_dist_max) { + range = min_semi_end - reg->anc_dist_max; } - if ((OnigLen )(max_semi_end - start) < reg->anchor_dmin) { - start = max_semi_end - reg->anchor_dmin; - start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start); + if (max_semi_end - start < reg->anc_dist_min) { + if (max_semi_end - str < reg->anc_dist_min) + goto mismatch_no_msa; + else { + start = max_semi_end - reg->anc_dist_min; + start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start); + } } if (range > start) goto mismatch_no_msa; } @@ -4942,29 +5382,33 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, if (reg->optimize != OPTIMIZE_NONE) { UChar *sch_range, *low, *high, *low_prev; - sch_range = (UChar* )range; - if (reg->dmax != 0) { - if (reg->dmax == INFINITE_LEN) + if (reg->dist_max != 0) { + if (reg->dist_max == INFINITE_LEN) sch_range = (UChar* )end; else { - sch_range += reg->dmax; - if (sch_range > end) sch_range = (UChar* )end; + if 
((end - range) < reg->dist_max) + sch_range = (UChar* )end; + else { + sch_range = (UChar* )range + reg->dist_max; + } } } + else + sch_range = (UChar* )range; if ((end - start) < reg->threshold_len) goto mismatch; - if (reg->dmax != INFINITE_LEN) { + if (reg->dist_max != INFINITE_LEN) { do { - if (! forward_search_range(reg, str, end, s, sch_range, - &low, &high, &low_prev)) goto mismatch; + if (! forward_search(reg, str, end, s, sch_range, &low, &high, + &low_prev)) goto mismatch; if (s < low) { s = low; prev = low_prev; } while (s <= high) { - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); prev = s; s += enclen(reg->enc, s); } @@ -4972,12 +5416,12 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, goto mismatch; } else { /* check only. */ - if (! forward_search_range(reg, str, end, s, sch_range, - &low, &high, (UChar** )NULL)) goto mismatch; + if (! forward_search(reg, str, end, s, sch_range, &low, &high, + (UChar** )NULL)) goto mismatch; if ((reg->anchor & ANCR_ANYCHAR_INF) != 0) { do { - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); prev = s; s += enclen(reg->enc, s); @@ -4994,13 +5438,13 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, } do { - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); prev = s; s += enclen(reg->enc, s); } while (s < range); if (s == range) { /* because empty match with /$/. 
*/ - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); } } else { /* backward search */ @@ -5011,19 +5455,30 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, if (reg->optimize != OPTIMIZE_NONE) { UChar *low, *high, *adjrange, *sch_start; + const UChar *min_range; + + if ((end - range) < reg->threshold_len) goto mismatch; if (range < end) adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); else adjrange = (UChar* )end; - if (reg->dmax != INFINITE_LEN && - (end - range) >= reg->threshold_len) { + if (end - range > reg->dist_min) + min_range = range + reg->dist_min; + else + min_range = end; + + if (reg->dist_max != INFINITE_LEN) { do { - sch_start = s + reg->dmax; - if (sch_start > end) sch_start = (UChar* )end; - if (backward_search_range(reg, str, end, sch_start, range, adjrange, - &low, &high) <= 0) + if (end - s > reg->dist_max) + sch_start = s + reg->dist_max; + else { + sch_start = onigenc_get_prev_char_head(reg->enc, str, end); + } + + if (backward_search(reg, str, end, sch_start, min_range, adjrange, + &low, &high) <= 0) goto mismatch; if (s > high) @@ -5038,22 +5493,10 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, goto mismatch; } else { /* check only. 
*/ - if ((end - range) < reg->threshold_len) goto mismatch; + sch_start = onigenc_get_prev_char_head(reg->enc, str, end); - sch_start = s; - if (reg->dmax != 0) { - if (reg->dmax == INFINITE_LEN) - sch_start = (UChar* )end; - else { - sch_start += reg->dmax; - if (sch_start > end) sch_start = (UChar* )end; - else - sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, - start, sch_start); - } - } - if (backward_search_range(reg, str, end, sch_start, range, adjrange, - &low, &high) <= 0) goto mismatch; + if (backward_search(reg, str, end, sch_start, min_range, adjrange, + &low, &high) <= 0) goto mismatch; } } @@ -5108,6 +5551,22 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, return (int )(s - str); } +extern int +onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, + const UChar* start, const UChar* range, OnigRegion* region, + OnigOptionType option, OnigMatchParam* mp) +{ + const UChar* data_range; + + if (range > start) + data_range = range; + else + data_range = end; + + return search_in_range(reg, str, end, start, range, data_range, region, + option, mp); +} + extern int onig_scan(regex_t* reg, const UChar* str, const UChar* end, OnigRegion* region, OnigOptionType option, @@ -5210,6 +5669,202 @@ onig_copy_encoding(OnigEncoding to, OnigEncoding from) *to = *from; } +extern int +onig_regset_new(OnigRegSet** rset, int n, regex_t* regs[]) +{ +#define REGSET_INITIAL_ALLOC_SIZE 10 + + int i; + int r; + int alloc; + OnigRegSet* set; + RR* rs; + + *rset = 0; + + set = (OnigRegSet* )xmalloc(sizeof(*set)); + CHECK_NULL_RETURN_MEMERR(set); + + alloc = n > REGSET_INITIAL_ALLOC_SIZE ? 
n : REGSET_INITIAL_ALLOC_SIZE; + rs = (RR* )xmalloc(sizeof(set->rs[0]) * alloc); + if (IS_NULL(rs)) { + xfree(set); + return ONIGERR_MEMORY; + } + + set->rs = rs; + set->n = 0; + set->alloc = alloc; + + for (i = 0; i < n; i++) { + regex_t* reg = regs[i]; + + r = onig_regset_add(set, reg); + if (r != 0) { + for (i = 0; i < set->n; i++) { + OnigRegion* region = set->rs[i].region; + if (IS_NOT_NULL(region)) + onig_region_free(region, 1); + } + xfree(set->rs); + xfree(set); + return r; + } + } + + *rset = set; + return 0; +} + +static void +update_regset_by_reg(OnigRegSet* set, regex_t* reg) +{ + if (set->n == 1) { + set->enc = reg->enc; + set->anchor = reg->anchor; + set->anc_dmin = reg->anc_dist_min; + set->anc_dmax = reg->anc_dist_max; + set->all_low_high = + (reg->optimize == OPTIMIZE_NONE || reg->dist_max == INFINITE_LEN) ? 0 : 1; + set->anychar_inf = (reg->anchor & ANCR_ANYCHAR_INF) != 0 ? 1 : 0; + } + else { + int anchor; + + anchor = set->anchor & reg->anchor; + if (anchor != 0) { + OnigLen anc_dmin; + OnigLen anc_dmax; + + anc_dmin = set->anc_dmin; + anc_dmax = set->anc_dmax; + if (anc_dmin > reg->anc_dist_min) anc_dmin = reg->anc_dist_min; + if (anc_dmax < reg->anc_dist_max) anc_dmax = reg->anc_dist_max; + set->anc_dmin = anc_dmin; + set->anc_dmax = anc_dmax; + } + + set->anchor = anchor; + + if (reg->optimize == OPTIMIZE_NONE || reg->dist_max == INFINITE_LEN) + set->all_low_high = 0; + + if ((reg->anchor & ANCR_ANYCHAR_INF) != 0) + set->anychar_inf = 1; + } +} + +extern int +onig_regset_add(OnigRegSet* set, regex_t* reg) +{ + OnigRegion* region; + + if (IS_FIND_LONGEST(reg->options)) + return ONIGERR_INVALID_ARGUMENT; + + if (set->n != 0 && reg->enc != set->enc) + return ONIGERR_INVALID_ARGUMENT; + + if (set->n >= set->alloc) { + RR* nrs; + int new_alloc; + + new_alloc = set->alloc * 2; + nrs = (RR* )xrealloc(set->rs, sizeof(set->rs[0]) * new_alloc); + CHECK_NULL_RETURN_MEMERR(nrs); + + set->rs = nrs; + set->alloc = new_alloc; + } + + region = 
onig_region_new(); + CHECK_NULL_RETURN_MEMERR(region); + + set->rs[set->n].reg = reg; + set->rs[set->n].region = region; + set->n++; + + update_regset_by_reg(set, reg); + return 0; +} + +extern int +onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) +{ + int i; + + if (at < 0 || at >= set->n) + return ONIGERR_INVALID_ARGUMENT; + + if (IS_NULL(reg)) { + onig_region_free(set->rs[at].region, 1); + for (i = at; i < set->n - 1; i++) { + set->rs[i].reg = set->rs[i+1].reg; + set->rs[i].region = set->rs[i+1].region; + } + set->n--; + } + else { + if (IS_FIND_LONGEST(reg->options)) + return ONIGERR_INVALID_ARGUMENT; + + if (set->n > 1 && reg->enc != set->enc) + return ONIGERR_INVALID_ARGUMENT; + + set->rs[at].reg = reg; + } + + for (i = 0; i < set->n; i++) + update_regset_by_reg(set, set->rs[i].reg); + + return 0; +} + +extern void +onig_regset_free(OnigRegSet* set) +{ + int i; + + for (i = 0; i < set->n; i++) { + regex_t* reg; + OnigRegion* region; + + reg = set->rs[i].reg; + region = set->rs[i].region; + onig_free(reg); + if (IS_NOT_NULL(region)) + onig_region_free(region, 1); + } + + xfree(set->rs); + xfree(set); +} + +extern int +onig_regset_number_of_regex(OnigRegSet* set) +{ + return set->n; +} + +extern regex_t* +onig_regset_get_regex(OnigRegSet* set, int at) +{ + if (at < 0 || at >= set->n) + return (regex_t* )0; + + return set->rs[at].reg; +} + +extern OnigRegion* +onig_regset_get_region(OnigRegSet* set, int at) +{ + if (at < 0 || at >= set->n) + return (OnigRegion* )0; + + return set->rs[at].region; +} + + #ifdef USE_DIRECT_THREADED_CODE extern int onig_init_for_match_at(regex_t* reg) @@ -5402,35 +6057,25 @@ onig_get_capture_range_in_callout(OnigCalloutArgs* a, int mem_num, int* begin, i const UChar* str; StackType* stk_base; int i; + StackIndex* mem_start_stk; + StackIndex* mem_end_stk; i = mem_num; reg = a->regex; str = a->string; stk_base = a->stk_base; + mem_start_stk = a->mem_start_stk; + mem_end_stk = a->mem_end_stk; if (i > 0) { if 
(a->mem_end_stk[i] != INVALID_STACK_INDEX) { - if (MEM_STATUS_AT(reg->bt_mem_start, i)) - *begin = (int )(STACK_AT(a->mem_start_stk[i])->u.mem.pstr - str); - else - *begin = (int )((UChar* )((void* )a->mem_start_stk[i]) - str); - - *end = (int )((MEM_STATUS_AT(reg->bt_mem_end, i) - ? STACK_AT(a->mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )a->mem_end_stk[i])) - str); + *begin = (int )(STACK_MEM_START(reg, i) - str); + *end = (int )(STACK_MEM_END(reg, i) - str); } else { *begin = *end = ONIG_REGION_NOTPOS; } } - else if (i == 0) { -#if 0 - *begin = a->start - str; - *end = a->current - str; -#else - return ONIGERR_INVALID_ARGUMENT; -#endif - } else return ONIGERR_INVALID_ARGUMENT; @@ -5468,14 +6113,6 @@ onig_builtin_mismatch(OnigCalloutArgs* args ARG_UNUSED, void* user_data ARG_UNUS return ONIG_MISMATCH; } -#if 0 -extern int -onig_builtin_success(OnigCalloutArgs* args ARG_UNUSED, void* user_data ARG_UNUSED) -{ - return ONIG_CALLOUT_SUCCESS; -} -#endif - extern int onig_builtin_error(OnigCalloutArgs* args, void* user_data ARG_UNUSED) { diff --git a/src/regext.c b/src/regext.c index 965c793..c46f630 100644 --- a/src/regext.c +++ b/src/regext.c @@ -2,7 +2,7 @@ regext.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/reggnu.c b/src/reggnu.c index a124ae8..8a45078 100644 --- a/src/reggnu.c +++ b/src/reggnu.c @@ -2,7 +2,7 @@ reggnu.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/regint.h b/src/regint.h index 38389a1..cc540da 100644 --- a/src/regint.h +++ b/src/regint.h @@ -4,7 +4,7 @@ regint.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -47,16 +47,11 @@ #endif #endif -#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ - (defined(__ppc__) && defined(__APPLE__)) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(__mc68020__) -#define PLATFORM_UNALIGNED_WORD_ACCESS -#endif - +#ifndef ONIG_DISABLE_DIRECT_THREADING #ifdef __GNUC__ #define USE_GOTO_LABELS_AS_VALUES #endif +#endif /* config */ /* spec. config */ @@ -82,6 +77,8 @@ #define USE_VARIABLE_META_CHARS #define USE_POSIX_API_REGION_OPTION #define USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE +/* #define USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR */ + #include "regenc.h" @@ -197,49 +194,16 @@ typedef unsigned int uintptr_t; #define CHAR_MAP_SIZE 256 #define INFINITE_LEN ONIG_INFINITE_DISTANCE -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS - -#define PLATFORM_GET_INC(val,p,type) do{\ - val = *(type* )p;\ - (p) += sizeof(type);\ -} while(0) - -#else - -#define PLATFORM_GET_INC(val,p,type) do{\ - xmemcpy(&val, (p), sizeof(type));\ - (p) += sizeof(type);\ -} while(0) - -/* sizeof(OnigCodePoint) */ -#ifdef SIZEOF_SIZE_T -# define WORD_ALIGNMENT_SIZE SIZEOF_SIZE_T -#else -# define WORD_ALIGNMENT_SIZE SIZEOF_LONG -#endif - -#define GET_ALIGNMENT_PAD_SIZE(addr,pad_size) do {\ - (pad_size) = WORD_ALIGNMENT_SIZE - ((uintptr_t )(addr) % WORD_ALIGNMENT_SIZE);\ - if ((pad_size) == WORD_ALIGNMENT_SIZE) (pad_size) = 0;\ -} while (0) - -#define ALIGNMENT_RIGHT(addr) do {\ - (addr) += (WORD_ALIGNMENT_SIZE - 1);\ - (addr) -= ((uintptr_t )(addr) % WORD_ALIGNMENT_SIZE);\ -} while 
(0) - -#endif /* PLATFORM_UNALIGNED_WORD_ACCESS */ - #ifdef USE_CALLOUT typedef struct { - int flag; - OnigCalloutOf of; - int in; - int name_id; - const UChar* tag_start; - const UChar* tag_end; + int flag; + OnigCalloutOf of; + int in; + int name_id; + const UChar* tag_start; + const UChar* tag_end; OnigCalloutType type; OnigCalloutFunc start_func; OnigCalloutFunc end_func; @@ -272,7 +236,6 @@ enum OptimizeType { OPTIMIZE_STR, /* Slow Search */ OPTIMIZE_STR_FAST, /* Sunday quick search / BMH */ OPTIMIZE_STR_FAST_STEP_FORWARD, /* Sunday quick search / BMH */ - OPTIMIZE_STR_CASE_FOLD_FAST, /* Sunday quick search / BMH (ignore case) */ OPTIMIZE_STR_CASE_FOLD, /* Slow Search (ignore case) */ OPTIMIZE_MAP /* char map */ }; @@ -288,6 +251,8 @@ typedef unsigned int MemStatusType; #define MEM_STATUS_AT0(stats,n) \ ((n) > 0 && (n) < (int )MEM_STATUS_BITS_NUM ? ((stats) & ((MemStatusType )1 << n)) : ((stats) & 1)) +#define MEM_STATUS_IS_ALL_ON(stats) (((stats) & 1) != 0) + #define MEM_STATUS_ON(stats,n) do {\ if ((n) < (int )MEM_STATUS_BITS_NUM) {\ if ((n) != 0)\ @@ -302,8 +267,14 @@ typedef unsigned int MemStatusType; (stats) |= ((MemStatusType )1 << (n));\ } while (0) +#define MEM_STATUS_LIMIT_AT(stats,n) \ + ((n) < (int )MEM_STATUS_BITS_NUM ? 
((stats) & ((MemStatusType )1 << n)) : 0) +#define MEM_STATUS_LIMIT_ON(stats,n) do {\ + if ((n) < (int )MEM_STATUS_BITS_NUM && (n) != 0) {\ + (stats) |= ((MemStatusType )1 << (n));\ + }\ +} while (0) -#define INT_MAX_LIMIT ((1UL << (SIZEOF_INT * 8 - 1)) - 1) #define IS_CODE_WORD_ASCII(enc,code) \ (ONIGENC_IS_CODE_ASCII(code) && ONIGENC_IS_CODE_WORD(enc,code)) @@ -354,16 +325,12 @@ typedef unsigned int MemStatusType; /* bitset */ #define BITS_PER_BYTE 8 #define SINGLE_BYTE_SIZE (1 << BITS_PER_BYTE) -#define BITS_IN_ROOM (sizeof(Bits) * BITS_PER_BYTE) +#define BITS_IN_ROOM 32 /* 4 * BITS_PER_BYTE */ #define BITSET_SIZE (SINGLE_BYTE_SIZE / BITS_IN_ROOM) -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS -typedef unsigned int Bits; -#else -typedef unsigned char Bits; -#endif -typedef Bits BitSet[BITSET_SIZE]; -typedef Bits* BitSetRef; +typedef uint32_t Bits; +typedef Bits BitSet[BITSET_SIZE]; +typedef Bits* BitSetRef; #define SIZE_BITSET sizeof(BitSet) @@ -372,8 +339,8 @@ typedef Bits* BitSetRef; for (i = 0; i < (int )BITSET_SIZE; i++) { (bs)[i] = 0; } \ } while (0) -#define BS_ROOM(bs,pos) (bs)[pos / BITS_IN_ROOM] -#define BS_BIT(pos) (1 << (pos % BITS_IN_ROOM)) +#define BS_ROOM(bs,pos) (bs)[(unsigned int )(pos) >> 5] +#define BS_BIT(pos) (1u << ((unsigned int )(pos) & 0x1f)) #define BITSET_AT(bs, pos) (BS_ROOM(bs,pos) & BS_BIT(pos)) #define BITSET_SET_BIT(bs, pos) BS_ROOM(bs,pos) |= BS_BIT(pos) @@ -389,11 +356,13 @@ typedef struct _BBuf { #define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size)) +/* #define BB_SIZE_INC(buf,inc) do{\ (buf)->alloc += (inc);\ (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ } while (0) +*/ #define BB_EXPAND(buf,low) do{\ do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ @@ -491,39 +460,34 @@ typedef struct _BBuf { /* operation code */ enum OpCode { - OP_FINISH = 0, /* matching process terminator (no more alternative) */ - OP_END = 1, /* pattern code terminator (success end) 
*/ - - OP_EXACT1 = 2, /* single byte, N = 1 */ - OP_EXACT2, /* single byte, N = 2 */ - OP_EXACT3, /* single byte, N = 3 */ - OP_EXACT4, /* single byte, N = 4 */ - OP_EXACT5, /* single byte, N = 5 */ - OP_EXACTN, /* single byte */ - OP_EXACTMB2N1, /* mb-length = 2 N = 1 */ - OP_EXACTMB2N2, /* mb-length = 2 N = 2 */ - OP_EXACTMB2N3, /* mb-length = 2 N = 3 */ - OP_EXACTMB2N, /* mb-length = 2 */ - OP_EXACTMB3N, /* mb-length = 3 */ - OP_EXACTMBN, /* other length */ - - OP_EXACT1_IC, /* single byte, N = 1, ignore case */ - OP_EXACTN_IC, /* single byte, ignore case */ - + OP_FINISH = 0, /* matching process terminator (no more alternative) */ + OP_END = 1, /* pattern code terminator (success end) */ + OP_STR_1 = 2, /* single byte, N = 1 */ + OP_STR_2, /* single byte, N = 2 */ + OP_STR_3, /* single byte, N = 3 */ + OP_STR_4, /* single byte, N = 4 */ + OP_STR_5, /* single byte, N = 5 */ + OP_STR_N, /* single byte */ + OP_STR_MB2N1, /* mb-length = 2 N = 1 */ + OP_STR_MB2N2, /* mb-length = 2 N = 2 */ + OP_STR_MB2N3, /* mb-length = 2 N = 3 */ + OP_STR_MB2N, /* mb-length = 2 */ + OP_STR_MB3N, /* mb-length = 3 */ + OP_STR_MBN, /* other length */ + OP_STR_1_IC, /* single byte, N = 1, ignore case */ + OP_STR_N_IC, /* single byte, ignore case */ OP_CCLASS, OP_CCLASS_MB, OP_CCLASS_MIX, OP_CCLASS_NOT, OP_CCLASS_MB_NOT, OP_CCLASS_MIX_NOT, - OP_ANYCHAR, /* "." */ OP_ANYCHAR_ML, /* "." 
multi-line */ OP_ANYCHAR_STAR, /* ".*" */ OP_ANYCHAR_ML_STAR, /* ".*" multi-line */ OP_ANYCHAR_STAR_PEEK_NEXT, OP_ANYCHAR_ML_STAR_PEEK_NEXT, - OP_WORD, OP_WORD_ASCII, OP_NO_WORD, @@ -532,16 +496,13 @@ enum OpCode { OP_NO_WORD_BOUNDARY, OP_WORD_BEGIN, OP_WORD_END, - OP_TEXT_SEGMENT_BOUNDARY, - OP_BEGIN_BUF, OP_END_BUF, OP_BEGIN_LINE, OP_END_LINE, OP_SEMI_END_BUF, OP_BEGIN_POSITION, - OP_BACKREF1, OP_BACKREF2, OP_BACKREF_N, @@ -552,34 +513,35 @@ enum OpCode { OP_BACKREF_WITH_LEVEL_IC, /* \k, \k */ OP_BACKREF_CHECK, /* (?(n)), (?('name')) */ OP_BACKREF_CHECK_WITH_LEVEL, /* (?(n-level)), (?('name-level')) */ - - OP_MEMORY_START, - OP_MEMORY_START_PUSH, /* push back-tracker to stack */ - OP_MEMORY_END_PUSH, /* push back-tracker to stack */ - OP_MEMORY_END_PUSH_REC, /* push back-tracker to stack */ - OP_MEMORY_END, - OP_MEMORY_END_REC, /* push marker to stack */ - + OP_MEM_START, + OP_MEM_START_PUSH, /* push back-tracker to stack */ + OP_MEM_END_PUSH, /* push back-tracker to stack */ +#ifdef USE_CALL + OP_MEM_END_PUSH_REC, /* push back-tracker to stack */ +#endif + OP_MEM_END, +#ifdef USE_CALL + OP_MEM_END_REC, /* push marker to stack */ +#endif OP_FAIL, /* pop stack and move */ OP_JUMP, OP_PUSH, OP_PUSH_SUPER, OP_POP_OUT, #ifdef USE_OP_PUSH_OR_JUMP_EXACT - OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ + OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ #endif - OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */ - OP_REPEAT, /* {n,m} */ - OP_REPEAT_NG, /* {n,m}? (non greedy) */ + OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */ + OP_REPEAT, /* {n,m} */ + OP_REPEAT_NG, /* {n,m}? 
(non greedy) */ OP_REPEAT_INC, - OP_REPEAT_INC_NG, /* non greedy */ - OP_REPEAT_INC_SG, /* search and get in stack */ - OP_REPEAT_INC_NG_SG, /* search and get in stack (non greedy) */ + OP_REPEAT_INC_NG, /* non greedy */ OP_EMPTY_CHECK_START, /* null loop checker start */ OP_EMPTY_CHECK_END, /* null loop checker end */ OP_EMPTY_CHECK_END_MEMST, /* null loop checker end (with capture status) */ +#ifdef USE_CALL OP_EMPTY_CHECK_END_MEMST_PUSH, /* with capture status and push check-end */ - +#endif OP_PREC_READ_START, /* (?=...) start */ OP_PREC_READ_END, /* (?=...) end */ OP_PREC_READ_NOT_START, /* (?!...) start */ @@ -589,11 +551,12 @@ enum OpCode { OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */ OP_LOOK_BEHIND_NOT_START, /* (? */ - OP_RETURN, OP_PUSH_SAVE_VAL, OP_UPDATE_VAR, +#ifdef USE_CALL + OP_CALL, /* \g */ + OP_RETURN, +#endif #ifdef USE_CALLOUT OP_CALLOUT_CONTENTS, /* (?{...}) (?{{...}}) */ OP_CALLOUT_NAME, /* (*name) (*name[tag](args...)) */ @@ -601,8 +564,8 @@ enum OpCode { }; enum SaveType { - SAVE_KEEP = 0, /* SAVE S */ - SAVE_S = 1, + SAVE_KEEP = 0, /* SAVE S */ + SAVE_S = 1, SAVE_RIGHT_RANGE = 2, }; @@ -642,116 +605,57 @@ typedef int ModeType; #define SIZE_UPDATE_VAR_TYPE sizeof(UpdateVarType) #define SIZE_MODE sizeof(ModeType) -#define GET_RELADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, RelAddrType) -#define GET_ABSADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, AbsAddrType) -#define GET_LENGTH_INC(len,p) PLATFORM_GET_INC(len, p, LengthType) -#define GET_MEMNUM_INC(num,p) PLATFORM_GET_INC(num, p, MemNumType) -#define GET_REPEATNUM_INC(num,p) PLATFORM_GET_INC(num, p, RepeatNumType) -#define GET_OPTION_INC(option,p) PLATFORM_GET_INC(option, p, OnigOptionType) -#define GET_POINTER_INC(ptr,p) PLATFORM_GET_INC(ptr, p, PointerType) -#define GET_SAVE_TYPE_INC(type,p) PLATFORM_GET_INC(type, p, SaveType) -#define GET_UPDATE_VAR_TYPE_INC(type,p) PLATFORM_GET_INC(type, p, UpdateVarType) -#define GET_MODE_INC(mode,p) PLATFORM_GET_INC(mode, p, ModeType) - /* 
code point's address must be aligned address. */ #define GET_CODE_POINT(code,p) code = *((OnigCodePoint* )(p)) -#define GET_BYTE_INC(byte,p) do{\ - byte = *(p);\ - (p)++;\ -} while(0) /* op-code + arg size */ -#if 0 -#define SIZE_OP_ANYCHAR_STAR SIZE_OPCODE -#define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT (SIZE_OPCODE + 1) -#define SIZE_OP_JUMP (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_PUSH (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_PUSH_SUPER (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_POP_OUT SIZE_OPCODE -#ifdef USE_OP_PUSH_OR_JUMP_EXACT -#define SIZE_OP_PUSH_OR_JUMP_EXACT1 (SIZE_OPCODE + SIZE_RELADDR + 1) -#endif -#define SIZE_OP_PUSH_IF_PEEK_NEXT (SIZE_OPCODE + SIZE_RELADDR + 1) -#define SIZE_OP_REPEAT_INC (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_REPEAT_INC_NG (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_WORD_BOUNDARY (SIZE_OPCODE + SIZE_MODE) -#define SIZE_OP_PREC_READ_START SIZE_OPCODE -#define SIZE_OP_PREC_READ_NOT_START (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_PREC_READ_END SIZE_OPCODE -#define SIZE_OP_PREC_READ_NOT_END SIZE_OPCODE -#define SIZE_OP_FAIL SIZE_OPCODE -#define SIZE_OP_MEMORY_START (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_START_PUSH (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END_PUSH (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END_PUSH_REC (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END_REC (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_ATOMIC_START SIZE_OPCODE -#define SIZE_OP_ATOMIC_END SIZE_OPCODE -#define SIZE_OP_EMPTY_CHECK_START (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_EMPTY_CHECK_END (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_LOOK_BEHIND (SIZE_OPCODE + SIZE_LENGTH) -#define SIZE_OP_LOOK_BEHIND_NOT_START (SIZE_OPCODE + SIZE_RELADDR + SIZE_LENGTH) -#define SIZE_OP_LOOK_BEHIND_NOT_END SIZE_OPCODE -#define SIZE_OP_CALL (SIZE_OPCODE + SIZE_ABSADDR) -#define SIZE_OP_RETURN SIZE_OPCODE -#define SIZE_OP_PUSH_SAVE_VAL (SIZE_OPCODE + 
SIZE_SAVE_TYPE + SIZE_MEMNUM) -#define SIZE_OP_UPDATE_VAR (SIZE_OPCODE + SIZE_UPDATE_VAR_TYPE + SIZE_MEMNUM) - -#ifdef USE_CALLOUT -#define SIZE_OP_CALLOUT_CONTENTS (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_CALLOUT_NAME (SIZE_OPCODE + SIZE_MEMNUM + SIZE_MEMNUM) -#endif - -#else /* if 0 */ /* for relative address increment to go next op. */ -#define SIZE_INC_OP 1 - -#define SIZE_OP_ANYCHAR_STAR 1 -#define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT 1 -#define SIZE_OP_JUMP 1 -#define SIZE_OP_PUSH 1 -#define SIZE_OP_PUSH_SUPER 1 -#define SIZE_OP_POP_OUT 1 +#define SIZE_INC 1 + +#define OPSIZE_ANYCHAR_STAR 1 +#define OPSIZE_ANYCHAR_STAR_PEEK_NEXT 1 +#define OPSIZE_JUMP 1 +#define OPSIZE_PUSH 1 +#define OPSIZE_PUSH_SUPER 1 +#define OPSIZE_POP_OUT 1 #ifdef USE_OP_PUSH_OR_JUMP_EXACT -#define SIZE_OP_PUSH_OR_JUMP_EXACT1 1 -#endif -#define SIZE_OP_PUSH_IF_PEEK_NEXT 1 -#define SIZE_OP_REPEAT 1 -#define SIZE_OP_REPEAT_INC 1 -#define SIZE_OP_REPEAT_INC_NG 1 -#define SIZE_OP_WORD_BOUNDARY 1 -#define SIZE_OP_PREC_READ_START 1 -#define SIZE_OP_PREC_READ_NOT_START 1 -#define SIZE_OP_PREC_READ_END 1 -#define SIZE_OP_PREC_READ_NOT_END 1 -#define SIZE_OP_BACKREF 1 -#define SIZE_OP_FAIL 1 -#define SIZE_OP_MEMORY_START 1 -#define SIZE_OP_MEMORY_START_PUSH 1 -#define SIZE_OP_MEMORY_END_PUSH 1 -#define SIZE_OP_MEMORY_END_PUSH_REC 1 -#define SIZE_OP_MEMORY_END 1 -#define SIZE_OP_MEMORY_END_REC 1 -#define SIZE_OP_ATOMIC_START 1 -#define SIZE_OP_ATOMIC_END 1 -#define SIZE_OP_EMPTY_CHECK_START 1 -#define SIZE_OP_EMPTY_CHECK_END 1 -#define SIZE_OP_LOOK_BEHIND 1 -#define SIZE_OP_LOOK_BEHIND_NOT_START 1 -#define SIZE_OP_LOOK_BEHIND_NOT_END 1 -#define SIZE_OP_CALL 1 -#define SIZE_OP_RETURN 1 -#define SIZE_OP_PUSH_SAVE_VAL 1 -#define SIZE_OP_UPDATE_VAR 1 +#define OPSIZE_PUSH_OR_JUMP_EXACT1 1 +#endif +#define OPSIZE_PUSH_IF_PEEK_NEXT 1 +#define OPSIZE_REPEAT 1 +#define OPSIZE_REPEAT_INC 1 +#define OPSIZE_REPEAT_INC_NG 1 +#define OPSIZE_WORD_BOUNDARY 1 +#define OPSIZE_PREC_READ_START 1 +#define 
OPSIZE_PREC_READ_NOT_START 1 +#define OPSIZE_PREC_READ_END 1 +#define OPSIZE_PREC_READ_NOT_END 1 +#define OPSIZE_BACKREF 1 +#define OPSIZE_FAIL 1 +#define OPSIZE_MEM_START 1 +#define OPSIZE_MEM_START_PUSH 1 +#define OPSIZE_MEM_END_PUSH 1 +#define OPSIZE_MEM_END_PUSH_REC 1 +#define OPSIZE_MEM_END 1 +#define OPSIZE_MEM_END_REC 1 +#define OPSIZE_ATOMIC_START 1 +#define OPSIZE_ATOMIC_END 1 +#define OPSIZE_EMPTY_CHECK_START 1 +#define OPSIZE_EMPTY_CHECK_END 1 +#define OPSIZE_LOOK_BEHIND 1 +#define OPSIZE_LOOK_BEHIND_NOT_START 1 +#define OPSIZE_LOOK_BEHIND_NOT_END 1 +#define OPSIZE_CALL 1 +#define OPSIZE_RETURN 1 +#define OPSIZE_PUSH_SAVE_VAL 1 +#define OPSIZE_UPDATE_VAR 1 #ifdef USE_CALLOUT -#define SIZE_OP_CALLOUT_CONTENTS 1 -#define SIZE_OP_CALLOUT_NAME 1 +#define OPSIZE_CALLOUT_CONTENTS 1 +#define OPSIZE_CALLOUT_NAME 1 #endif -#endif /* if 0 */ #define MC_ESC(syn) (syn)->meta_char_table.esc @@ -882,7 +786,7 @@ typedef struct { } repeat; /* REPEAT, REPEAT_NG */ struct { MemNumType id; - } repeat_inc; /* REPEAT_INC, REPEAT_INC_SG, REPEAT_INC_NG, REPEAT_INC_NG_SG */ + } repeat_inc; /* REPEAT_INC, REPEAT_INC_NG */ struct { MemNumType mem; } empty_check_start; @@ -933,48 +837,58 @@ typedef struct { #endif } RegexExt; +typedef struct { + int lower; + int upper; + union { + Operation* pcode; /* address of repeated body */ + int offset; + } u; +} RepeatRange; + struct re_pattern_buffer { /* common members of BBuf(bytes-buffer) */ Operation* ops; #ifdef USE_DIRECT_THREADED_CODE enum OpCode* ocs; #endif - Operation* ops_curr; - unsigned int ops_used; /* used space for ops */ - unsigned int ops_alloc; /* allocated space for ops */ + Operation* ops_curr; + unsigned int ops_used; /* used space for ops */ + unsigned int ops_alloc; /* allocated space for ops */ unsigned char* string_pool; unsigned char* string_pool_end; - int num_mem; /* used memory(...) 
num counted from 1 */ - int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ - int num_null_check; /* OP_EMPTY_CHECK_START/END id counter */ - int num_call; /* number of subexp call */ - unsigned int capture_history; /* (?@...) flag (1-31) */ - unsigned int bt_mem_start; /* need backtrack flag */ - unsigned int bt_mem_end; /* need backtrack flag */ - int stack_pop_level; - int repeat_range_alloc; - OnigRepeatRange* repeat_range; - - OnigEncoding enc; - OnigOptionType options; - OnigSyntaxType* syntax; - OnigCaseFoldType case_fold_flag; - void* name_table; + int num_mem; /* used memory(...) num counted from 1 */ + int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ + int num_empty_check; /* OP_EMPTY_CHECK_START/END id counter */ + int num_call; /* number of subexp call */ + MemStatusType capture_history; /* (?@...) flag (1-31) */ + MemStatusType push_mem_start; /* need backtrack flag */ + MemStatusType push_mem_end; /* need backtrack flag */ + MemStatusType empty_status_mem; + int stack_pop_level; + int repeat_range_alloc; + RepeatRange* repeat_range; + + OnigEncoding enc; + OnigOptionType options; + OnigSyntaxType* syntax; + OnigCaseFoldType case_fold_flag; + void* name_table; /* optimization info (string search, char-map and anchors) */ int optimize; /* optimize flag */ int threshold_len; /* search str-length for apply optimize */ int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ - OnigLen anchor_dmin; /* (SEMI_)END_BUF anchor distance */ - OnigLen anchor_dmax; /* (SEMI_)END_BUF anchor distance */ + OnigLen anc_dist_min; /* (SEMI_)END_BUF anchor distance */ + OnigLen anc_dist_max; /* (SEMI_)END_BUF anchor distance */ int sub_anchor; /* start-anchor for exact or map */ unsigned char *exact; unsigned char *exact_end; unsigned char map[CHAR_MAP_SIZE]; /* used as BMH skip or char-map */ int map_offset; - OnigLen dmin; /* min-distance of exact or map */ - OnigLen dmax; /* max-distance of exact or map */ + OnigLen dist_min; /* min-distance of exact or map */ 
+ OnigLen dist_max; /* max-distance of exact or map */ RegexExt* extp; }; diff --git a/src/regparse.c b/src/regparse.c index 7f8b1a9..fed53f7 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -2,7 +2,7 @@ regparse.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -199,6 +199,24 @@ onig_set_parse_depth_limit(unsigned int depth) return 0; } +#ifdef ONIG_DEBUG_PARSE +#define INC_PARSE_DEPTH(d) do {\ + (d)++;\ + if (env->max_parse_depth < (d)) env->max_parse_depth = d;\ + if ((d) > ParseDepthLimit) \ + return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ +} while (0) +#else +#define INC_PARSE_DEPTH(d) do {\ + (d)++;\ + if ((d) > ParseDepthLimit) \ + return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ +} while (0) +#endif + +#define DEC_PARSE_DEPTH(d) (d)-- + + static int bbuf_init(BBuf* buf, int size) { @@ -244,7 +262,8 @@ bbuf_clone(BBuf** rto, BBuf* from) return 0; } -static int backref_rel_to_abs(int rel_no, ScanEnv* env) +static int +backref_rel_to_abs(int rel_no, ScanEnv* env) { if (rel_no > 0) { return env->num_mem + rel_no; @@ -292,15 +311,6 @@ bitset_set_range(BitSetRef bs, int from, int to) } } -#if 0 -static void -bitset_set_all(BitSetRef bs) -{ - int i; - for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); } -} -#endif - static void bitset_invert(BitSetRef bs) { @@ -363,24 +373,6 @@ save_entry(ScanEnv* env, enum SaveType type, int* id) { int nid = env->save_num; -#if 0 - if (IS_NULL(env->saves)) { - int n = 10; - env->saves = (SaveItem* )xmalloc(sizeof(SaveItem) * n); - CHECK_NULL_RETURN_MEMERR(env->saves); - env->save_alloc_num = n; - } - else if (env->save_alloc_num <= nid) { - int n = env->save_alloc_num * 2; - SaveItem* p = (SaveItem* )xrealloc(env->saves, sizeof(SaveItem) * n); - CHECK_NULL_RETURN_MEMERR(p); - env->saves = p; - 
env->save_alloc_num = n; - } - - env->saves[nid].type = type; -#endif - env->save_num++; *id = nid; return 0; @@ -476,14 +468,14 @@ static int str_end_hash(st_str_end_key* x) { UChar *p; - int val = 0; + unsigned val = 0; p = x->s; while (p < x->end) { - val = val * 997 + (int )*p++; + val = val * 997 + (unsigned )*p++; } - return val + (val >> 5); + return (int) (val + (val >> 5)); } extern hash_table_type* @@ -566,15 +558,15 @@ static int callout_name_table_hash(st_callout_name_key* x) { UChar *p; - int val = 0; + unsigned int val = 0; p = x->s; while (p < x->end) { - val = val * 997 + (int )*p++; + val = val * 997 + (unsigned int )*p++; } /* use intptr_t for escape warning in Windows */ - return val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type; + return (int )(val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type); } extern hash_table_type* @@ -1972,9 +1964,8 @@ callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, static void scan_env_clear(ScanEnv* env) { - MEM_STATUS_CLEAR(env->capture_history); - MEM_STATUS_CLEAR(env->bt_mem_start); - MEM_STATUS_CLEAR(env->bt_mem_end); + MEM_STATUS_CLEAR(env->cap_history); + MEM_STATUS_CLEAR(env->backtrack_mem); MEM_STATUS_CLEAR(env->backrefed_mem); env->error = (UChar* )NULL; env->error_end = (UChar* )NULL; @@ -1993,6 +1984,10 @@ scan_env_clear(ScanEnv* env) xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static)); env->parse_depth = 0; +#ifdef ONIG_DEBUG_PARSE + env->max_parse_depth = 0; +#endif + env->backref_num = 0; env->keep_num = 0; env->save_num = 0; env->save_alloc_num = 0; @@ -2024,11 +2019,8 @@ scan_env_add_mem_entry(ScanEnv* env) } for (i = env->num_mem + 1; i < alloc; i++) { - p[i].node = NULL_NODE; -#if 0 - p[i].in = 0; - p[i].recursion = 0; -#endif + p[i].mem_node = NULL_NODE; + p[i].empty_repeat_node = NULL_NODE; } env->mem_env_dynamic = p; @@ -2044,7 +2036,7 @@ static int scan_env_set_mem_node(ScanEnv* env, int num, Node* node) { if (env->num_mem >= num) - 
SCANENV_MEMENV(env)[num].node = node; + SCANENV_MEMENV(env)[num].mem_node = node; else return ONIGERR_PARSER_BUG; return 0; @@ -2182,7 +2174,7 @@ node_new_ctype(int type, int not, OnigOptionType options) static Node* node_new_anychar(void) { - Node* node = node_new_ctype(CTYPE_ANYCHAR, 0, ONIG_OPTION_NONE); + Node* node = node_new_ctype(CTYPE_ANYCHAR, FALSE, ONIG_OPTION_NONE); return node; } @@ -2241,24 +2233,6 @@ onig_node_new_list(Node* left, Node* right) return node_new_list(left, right); } -extern Node* -onig_node_list_add(Node* list, Node* x) -{ - Node *n; - - n = onig_node_new_list(x, NULL); - if (IS_NULL(n)) return NULL_NODE; - - if (IS_NOT_NULL(list)) { - while (IS_NOT_NULL(NODE_CDR(list))) - list = NODE_CDR(list); - - NODE_CDR(list) = n; - } - - return n; -} - extern Node* onig_node_new_alt(Node* left, Node* right) { @@ -2357,7 +2331,7 @@ node_new_backref(int back_num, int* backrefs, int by_name, for (i = 0; i < back_num; i++) { if (backrefs[i] <= env->num_mem && - IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].node)) { + IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) { NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */ break; } @@ -2377,6 +2351,8 @@ node_new_backref(int back_num, int* backrefs, int by_name, for (i = 0; i < back_num; i++) p[i] = backrefs[i]; } + + env->backref_num++; return node; } @@ -2424,13 +2400,13 @@ node_new_quantifier(int lower, int upper, int by_number) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_QUANT); - QUANT_(node)->lower = lower; - QUANT_(node)->upper = upper; - QUANT_(node)->greedy = 1; - QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; - QUANT_(node)->head_exact = NULL_NODE; - QUANT_(node)->next_head_exact = NULL_NODE; - QUANT_(node)->is_refered = 0; + QUANT_(node)->lower = lower; + QUANT_(node)->upper = upper; + QUANT_(node)->greedy = 1; + QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; + QUANT_(node)->head_exact = NULL_NODE; + QUANT_(node)->next_head_exact = NULL_NODE; + QUANT_(node)->include_referred = 0; if (by_number != 
0) NODE_STATUS_ADD(node, BY_NUMBER); @@ -2716,7 +2692,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[1] = NULL_NODE; r = ONIGERR_MEMORY; - ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, 0); + ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, FALSE); if (IS_NULL(ns[0])) goto err; r = node_new_true_anychar(&ns[1], env); @@ -2727,7 +2703,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[0] = x; ns[1] = NULL_NODE; - x = node_new_quantifier(0, INFINITE_REPEAT, 1); + x = node_new_quantifier(0, INFINITE_REPEAT, TRUE); if (IS_NULL(x)) goto err; NODE_BODY(x) = ns[0]; @@ -2796,7 +2772,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, ns[0] = x; - x = node_new_quantifier(lower, upper, 0); + x = node_new_quantifier(lower, upper, FALSE); if (IS_NULL(x)) goto err0; NODE_BODY(x) = ns[0]; @@ -2825,7 +2801,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, x = make_alt(2, ns); if (IS_NULL(x)) goto err0; - if (is_range_cutter != 0) + if (is_range_cutter != FALSE) NODE_STATUS_ADD(x, SUPER); *node = x; @@ -2915,7 +2891,10 @@ make_range_clear(Node** node, ScanEnv* env) ns[0] = NULL_NODE; ns[1] = x; - r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, 0, env); +#define ID_NOT_USED_DONT_CARE_ME 0 + + r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, + ID_NOT_USED_DONT_CARE_ME, env); if (r != 0) goto err; x = make_alt(2, ns); @@ -3034,7 +3013,7 @@ make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* qua id1 = GIMMICK_(ns[0])->id; r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive, - 0, env); + FALSE, env); if (r != 0) goto err; ns[2] = ns[3] = NULL_NODE; @@ -3077,7 +3056,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (expr == NULL_NODE) { /* default expr \O* */ - quant = node_new_quantifier(0, INFINITE_REPEAT, 0); + quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE); if (IS_NULL(quant)) 
goto err0; r = node_new_true_anychar(&body, env); @@ -3203,16 +3182,6 @@ node_str_cat_char(Node* node, UChar c) return onig_node_str_cat(node, s, s + 1); } -extern void -onig_node_conv_to_str_node(Node* node, int flag) -{ - NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->flag = flag; - STR_(node)->capacity = 0; - STR_(node)->s = STR_(node)->buf; - STR_(node)->end = STR_(node)->buf; -} - extern void onig_node_str_clear(Node* node) { @@ -3221,10 +3190,11 @@ onig_node_str_clear(Node* node) xfree(STR_(node)->s); } - STR_(node)->capacity = 0; STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; } static Node* @@ -3234,10 +3204,12 @@ node_new_str(const UChar* s, const UChar* end) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->capacity = 0; STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; + if (onig_node_str_cat(node, s, end)) { onig_node_free(node); return NULL; @@ -3252,11 +3224,11 @@ onig_node_new_str(const UChar* s, const UChar* end) } static Node* -node_new_str_raw(UChar* s, UChar* end) +node_new_str_crude(UChar* s, UChar* end) { Node* node = node_new_str(s, end); CHECK_NULL_RETURN(node); - NODE_STRING_SET_RAW(node); + NODE_STRING_SET_CRUDE(node); return node; } @@ -3267,14 +3239,14 @@ node_new_empty(void) } static Node* -node_new_str_raw_char(UChar c) +node_new_str_crude_char(UChar c) { int i; UChar p[1]; Node* node; p[0] = c; - node = node_new_str_raw(p, p + 1); + node = node_new_str_crude(p, p + 1); /* clear buf tail */ for (i = 1; i < NODE_STRING_BUF_SIZE; i++) @@ -3297,8 +3269,8 @@ str_node_split_last_char(Node* node, OnigEncoding enc) if (p && p > sn->s) { /* can be split. 
*/ rn = node_new_str(p, sn->end); CHECK_NULL_RETURN(rn); - if (NODE_STRING_IS_RAW(node)) - NODE_STRING_SET_RAW(rn); + if (NODE_STRING_IS_CRUDE(node)) + NODE_STRING_SET_CRUDE(rn); sn->end = (UChar* )p; } @@ -3316,10 +3288,10 @@ str_node_can_be_split(Node* node, OnigEncoding enc) return 0; } -extern int -onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) +static int +scan_number(UChar** src, const UChar* end, OnigEncoding enc) { - unsigned int num, val; + int num, val; OnigCodePoint c; UChar* p = *src; PFETCH_READY; @@ -3328,8 +3300,8 @@ onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) while (! PEND) { PFETCH(c); if (IS_CODE_DIGIT_ASCII(enc, c)) { - val = (unsigned int )DIGITVAL(c); - if ((INT_MAX_LIMIT - val) / 10UL < num) + val = (int )DIGITVAL(c); + if ((INT_MAX - val) / 10 < num) return -1; /* overflow */ num = num * 10 + val; @@ -3344,26 +3316,27 @@ onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) } static int -scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, - int maxlen, OnigEncoding enc) +scan_hexadecimal_number(UChar** src, UChar* end, int minlen, int maxlen, + OnigEncoding enc, OnigCodePoint* rcode) { + OnigCodePoint code; OnigCodePoint c; - unsigned int num, val; + unsigned int val; int n; UChar* p = *src; PFETCH_READY; - num = 0; + code = 0; n = 0; while (! 
PEND && n < maxlen) { PFETCH(c); if (IS_CODE_XDIGIT_ASCII(enc, c)) { n++; - val = (unsigned int )XDIGITVAL(enc,c); - if ((INT_MAX_LIMIT - val) / 16UL < num) + val = (unsigned int )XDIGITVAL(enc, c); + if ((UINT_MAX - val) / 16UL < code) return ONIGERR_TOO_BIG_NUMBER; /* overflow */ - num = (num << 4) + XDIGITVAL(enc,c); + code = (code << 4) + val; } else { PUNFETCH; @@ -3374,36 +3347,46 @@ scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, if (n < minlen) return ONIGERR_INVALID_CODE_POINT_VALUE; + *rcode = code; *src = p; - return num; + return ONIG_NORMAL; } static int -scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, - OnigEncoding enc) +scan_octal_number(UChar** src, UChar* end, int minlen, int maxlen, + OnigEncoding enc, OnigCodePoint* rcode) { + OnigCodePoint code; OnigCodePoint c; - unsigned int num, val; + unsigned int val; + int n; UChar* p = *src; PFETCH_READY; - num = 0; - while (! PEND && maxlen-- != 0) { + code = 0; + n = 0; + while (! PEND && n < maxlen) { PFETCH(c); if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8') { - val = ODIGITVAL(c); - if ((INT_MAX_LIMIT - val) / 8UL < num) - return -1; /* overflow */ + n++; + val = (unsigned int )ODIGITVAL(c); + if ((UINT_MAX - val) / 8UL < code) + return ONIGERR_TOO_BIG_NUMBER; /* overflow */ - num = (num << 3) + val; + code = (code << 3) + val; } else { PUNFETCH; break; } } + + if (n < minlen) + return ONIGERR_INVALID_CODE_POINT_VALUE; + + *rcode = code; *src = p; - return num; + return ONIG_NORMAL; } @@ -3938,68 +3921,70 @@ static enum ReduceType ReduceTypeTable[6][6] = { {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' 
*/ }; -extern void -onig_reduce_nested_quantifier(Node* pnode, Node* cnode) +extern int +onig_reduce_nested_quantifier(Node* pnode) { int pnum, cnum; QuantNode *p, *c; + Node* cnode; + + cnode = NODE_BODY(pnode); p = QUANT_(pnode); c = QUANT_(cnode); pnum = quantifier_type_num(p); cnum = quantifier_type_num(c); if (pnum < 0 || cnum < 0) { - if ((p->lower == p->upper) && ! IS_INFINITE_REPEAT(p->upper)) { - if ((c->lower == c->upper) && ! IS_INFINITE_REPEAT(c->upper)) { - int n = onig_positive_int_multiply(p->lower, c->lower); - if (n >= 0) { - p->lower = p->upper = n; - NODE_BODY(pnode) = NODE_BODY(cnode); - goto remove_cnode; - } - } + if (p->lower == p->upper && c->lower == c->upper) { + int n = onig_positive_int_multiply(p->lower, c->lower); + if (n < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + + p->lower = p->upper = n; + NODE_BODY(pnode) = NODE_BODY(cnode); + goto remove_cnode; } - return ; + return 0; } switch(ReduceTypeTable[cnum][pnum]) { case RQ_DEL: *pnode = *cnode; + goto remove_cnode; break; case RQ_A: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1; + goto remove_cnode; break; case RQ_AQ: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0; + goto remove_cnode; break; case RQ_QQ: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = 1; p->greedy = 0; + goto remove_cnode; break; case RQ_P_QQ: - NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 0; c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1; - return ; break; case RQ_PQ_Q: - NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 1; c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0; - return ; break; case RQ_ASIS: - NODE_BODY(pnode) = cnode; - return ; break; } + return 0; + remove_cnode: NODE_BODY(cnode) = NULL_NODE; onig_node_free(cnode); + return 0; } static int @@ -4018,7 +4003,7 @@ node_new_general_newline(Node** node, ScanEnv* env) alen = 
ONIGENC_CODE_TO_MBC(env->enc, 0x0a, buf + dlen); if (alen < 0) return alen; - crnl = node_new_str_raw(buf, buf + dlen + alen); + crnl = node_new_str_crude(buf, buf + dlen + alen); CHECK_NULL_RETURN_MEMERR(crnl); ncc = node_new_cclass(); @@ -4046,7 +4031,7 @@ node_new_general_newline(Node** node, ScanEnv* env) if (r != 0) goto err1; } - x = node_new_bag_if_else(crnl, 0, ncc); + x = node_new_bag_if_else(crnl, NULL_NODE, ncc); if (IS_NULL(x)) goto err1; *node = x; @@ -4055,7 +4040,7 @@ node_new_general_newline(Node** node, ScanEnv* env) enum TokenSyms { TK_EOT = 0, /* end of token */ - TK_RAW_BYTE = 1, + TK_CRUDE_BYTE = 1, TK_CHAR, TK_STRING, TK_CODE_POINT, @@ -4070,7 +4055,7 @@ enum TokenSyms { TK_ALT, TK_SUBEXP_OPEN, TK_SUBEXP_CLOSE, - TK_CC_OPEN, + TK_OPEN_CC, TK_QUOTE_OPEN, TK_CHAR_PROPERTY, /* \p{...}, \P{...} */ TK_KEEP, /* \K */ @@ -4082,9 +4067,9 @@ enum TokenSyms { /* in cc */ TK_CC_CLOSE, TK_CC_RANGE, - TK_POSIX_BRACKET_OPEN, - TK_CC_AND, /* && */ - TK_CC_CC_OPEN /* [ */ + TK_CC_POSIX_BRACKET_OPEN, + TK_CC_AND, /* && */ + TK_CC_OPEN_CC /* [ */ }; typedef struct { @@ -4094,7 +4079,7 @@ typedef struct { UChar* backp; union { UChar* s; - int c; + UChar byte; OnigCodePoint code; int anchor; int subtype; @@ -4129,7 +4114,7 @@ typedef struct { static int -fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) +fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env) { int low, up, syn_allow, non_low = 0; int r = 0; @@ -4154,7 +4139,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) } } - low = onig_scan_unsigned_number(&p, end, env->enc); + low = scan_number(&p, end, env->enc); if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; if (low > ONIG_MAX_REPEAT_NUM) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; @@ -4173,7 +4158,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) PFETCH(c); if (c == ',') { UChar* prev = p; - up = onig_scan_unsigned_number(&p, end, 
env->enc); + up = scan_number(&p, end, env->enc); if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; if (up > ONIG_MAX_REPEAT_NUM) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; @@ -4196,7 +4181,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) if (PEND) goto invalid; PFETCH(c); if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { - if (c != MC_ESC(env->syntax)) goto invalid; + if (c != MC_ESC(env->syntax) || PEND) goto invalid; PFETCH(c); } if (c != '}') goto invalid; @@ -4419,7 +4404,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, PFETCH(c); if (! IS_CODE_DIGIT_ASCII(enc, c)) goto err; PUNFETCH; - level = onig_scan_unsigned_number(&p, end, enc); + level = scan_number(&p, end, enc); if (level < 0) return ONIGERR_TOO_BIG_NUMBER; *rlevel = (level * flag); exist_level = 1; @@ -4440,7 +4425,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, end: if (r == 0) { if (*num_type != IS_NOT_NUM) { - *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + *rback_num = scan_number(&pnum_head, name_end, enc); if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; else if (*rback_num == 0) { if (*num_type == IS_REL_NUM) @@ -4468,7 +4453,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, static int fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int* rback_num, - enum REF_NUM* num_type, int ref) + enum REF_NUM* num_type, int is_ref) { int r, sign; int digit_count; @@ -4498,7 +4483,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, return ONIGERR_EMPTY_GROUP_NAME; if (IS_CODE_DIGIT_ASCII(enc, c)) { - if (ref == 1) + if (is_ref == TRUE) *num_type = IS_ABS_NUM; else { r = ONIGERR_INVALID_GROUP_NAME; @@ -4506,7 +4491,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, digit_count++; } else if (c == '-') { - if (ref == 1) { + if (is_ref == TRUE) { *num_type = 
IS_REL_NUM; sign = -1; pnum_head = p; @@ -4516,7 +4501,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } } else if (c == '+') { - if (ref == 1) { + if (is_ref == TRUE) { *num_type = IS_REL_NUM; sign = 1; pnum_head = p; @@ -4566,7 +4551,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } if (*num_type != IS_NOT_NUM) { - *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + *rback_num = scan_number(&pnum_head, name_end, enc); if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; else if (*rback_num == 0) { if (*num_type == IS_REL_NUM) { @@ -4698,7 +4683,8 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, static int fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int num; + int r; + OnigCodePoint code; OnigCodePoint c, c2; OnigSyntaxType* syn = env->syntax; OnigEncoding enc = env->enc; @@ -4714,7 +4700,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PFETCH(c); tok->type = TK_CHAR; tok->base = 0; - tok->u.c = c; + tok->u.code = c; tok->escaped = 0; if (c == ']') { @@ -4731,7 +4717,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PFETCH(c); tok->escaped = 1; - tok->u.c = c; + tok->u.code = c; switch (c) { case 'w': tok->type = TK_CHAR_TYPE; @@ -4804,8 +4790,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { PINC; - num = scan_unsigned_octal_number(&p, end, 11, enc); - if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + r = scan_octal_number(&p, end, 0, 11, enc, &code); + if (r < 0) return r; if (!PEND) { c2 = PPEEK; if (IS_CODE_DIGIT_ASCII(enc, c2)) @@ -4816,7 +4802,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; tok->type = TK_CODE_POINT; tok->base = 8; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -4831,13 
+4817,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); - if (num < 0) { - if (num == ONIGERR_TOO_BIG_NUMBER) - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - else - return num; - } + r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); + if (r < 0) return r; if (!PEND) { c2 = PPEEK; if (IS_CODE_XDIGIT_ASCII(enc, c2)) @@ -4848,7 +4829,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -4856,14 +4837,14 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 16; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; @@ -4872,14 +4853,14 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. 
*/ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } break; @@ -4888,22 +4869,23 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { PUNFETCH; prev = p; - num = scan_unsigned_octal_number(&p, end, 3, enc); - if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER; + r = scan_octal_number(&p, end, 0, 3, enc, &code); + if (r < 0) return r; + if (code >= 256) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 8; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; default: PUNFETCH; - num = fetch_escaped_value(&p, end, env, &c2); - if (num < 0) return num; - if (tok->u.c != c2) { + r = fetch_escaped_value(&p, end, env, &c2); + if (r < 0) return r; + if (tok->u.code != c2) { tok->u.code = c2; tok->type = TK_CODE_POINT; } @@ -4917,7 +4899,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; if (str_exist_check_with_esc(send, 2, p, end, (OnigCodePoint )']', enc, syn)) { - tok->type = TK_POSIX_BRACKET_OPEN; + tok->type = TK_CC_POSIX_BRACKET_OPEN; } else { PUNFETCH; @@ -4927,7 +4909,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) else { cc_in_cc: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) { - tok->type = TK_CC_CC_OPEN; + tok->type = TK_CC_OPEN_CC; } else { CC_ESC_WARN(env, (UChar* )"["); @@ -4950,7 +4932,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int r, num; + int r; + OnigCodePoint code; OnigCodePoint c; OnigEncoding enc = env->enc; OnigSyntaxType* syn = env->syntax; @@ -4975,7 +4958,7 @@ fetch_token(PToken* tok, UChar** src, UChar* 
end, ScanEnv* env) tok->backp = p; PFETCH(c); - tok->u.c = c; + tok->u.code = c; tok->escaped = 1; switch (c) { case '*': @@ -5026,7 +5009,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; - r = fetch_interval_quantifier(&p, end, tok, env); + r = fetch_interval(&p, end, tok, env); if (r < 0) return r; /* error */ if (r == 0) goto greedy_check2; else if (r == 2) { /* {n} */ @@ -5214,8 +5197,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { PINC; - num = scan_unsigned_octal_number(&p, end, 11, enc); - if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + r = scan_octal_number(&p, end, 0, 11, enc, &code); + if (r < 0) return r; if (!PEND) { if (IS_CODE_DIGIT_ASCII(enc, PPEEK)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; @@ -5224,7 +5207,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -5239,13 +5222,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); - if (num < 0) { - if (num == ONIGERR_TOO_BIG_NUMBER) - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - else - return num; - } + r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); + if (r < 0) return r; if (!PEND) { if (IS_CODE_XDIGIT_ASCII(enc, PPEEK)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; @@ -5254,7 +5232,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* 
can't read nothing or invalid format */ @@ -5262,14 +5240,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 16; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; @@ -5278,14 +5256,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } break; @@ -5293,21 +5271,21 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '5': case '6': case '7': case '8': case '9': PUNFETCH; prev = p; - num = onig_scan_unsigned_number(&p, end, enc); - if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { + r = scan_number(&p, end, enc); + if (r < 0 || r > ONIG_MAX_BACKREF_NUM) { goto skip_backref; } if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && - (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ + (r <= env->num_mem || r <= 9)) { /* This spec. 
from GNU regex */ if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { - if (num > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[num].node)) + if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; tok->u.backref.num = 1; - tok->u.backref.ref1 = num; + tok->u.backref.ref1 = r; tok->u.backref.by_name = 0; #ifdef USE_BACKREF_WITH_LEVEL tok->u.backref.exist_level = 0; @@ -5327,14 +5305,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '0': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { prev = p; - num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); - if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER; + r = scan_octal_number(&p, end, 0, (c == '0' ? 2:3), enc, &code); + if (r < 0 || r >= 256) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 8; - tok->u.c = num; + tok->u.byte = (UChar )code; } else if (c != '0') { PINC; @@ -5359,7 +5337,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (r == 1) tok->u.backref.exist_level = 1; else tok->u.backref.exist_level = 0; #else - r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, 1); + r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) return r; @@ -5372,7 +5350,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].node)) + IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; @@ -5381,7 +5359,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.ref1 = back_num; } else { - num = name_to_group_numbers(env, prev, name_end, &backs); + int num = 
name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { return ONIGERR_UNDEFINED_NAME_REFERENCE; } @@ -5389,7 +5367,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].node)) + IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } @@ -5422,7 +5400,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, - &gnum, &num_type, 1); + &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type != IS_NOT_NUM) { @@ -5483,10 +5461,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) OnigCodePoint c2; PUNFETCH; - num = fetch_escaped_value(&p, end, env, &c2); - if (num < 0) return num; - /* set_raw: */ - if (tok->u.c != c2) { + r = fetch_escaped_value(&p, end, env, &c2); + if (r < 0) return r; + if (tok->u.code != c2) { tok->type = TK_CODE_POINT; tok->u.code = c2; } @@ -5498,7 +5475,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else { - tok->u.c = c; + tok->u.code = c; tok->escaped = 0; #ifdef USE_VARIABLE_META_CHARS @@ -5563,7 +5540,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; - r = fetch_interval_quantifier(&p, end, tok, env); + r = fetch_interval(&p, end, tok, env); if (r < 0) return r; /* error */ if (r == 0) goto greedy_check2; else if (r == 2) { /* {n} */ @@ -5611,8 +5588,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { PINC; name = p; - r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, - &num_type, 0); + r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, + &gnum, &num_type, FALSE); if (r < 0) return r; tok->type = TK_CALL; @@ -5644,7 +5621,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { name = p; r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, - &gnum, &num_type, 1); + &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type == IS_NOT_NUM) { @@ -5700,7 +5677,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '[': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; - tok->type = TK_CC_OPEN; + tok->type = TK_OPEN_CC; break; case ']': @@ -5911,6 +5888,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) int c, r; int ascii_mode; + int is_single; const OnigCodePoint *ranges; OnigCodePoint limit; OnigCodePoint sb_out; @@ -5932,6 +5910,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } r = 0; + is_single = ONIGENC_IS_SINGLEBYTE(enc); limit = ascii_mode ? ASCII_LIMIT : SINGLE_BYTE_SIZE; switch (ctype) { @@ -5948,19 +5927,25 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) case ONIGENC_CTYPE_ALNUM: if (not != 0) { for (c = 0; c < (int )limit; c++) { - if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) { + if (! 
ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } } for (c = limit; c < SINGLE_BYTE_SIZE; c++) { - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) + BITSET_SET_BIT(cc->bs, c); } - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + if (is_single == 0) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) { + if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } } } break; @@ -5970,21 +5955,25 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) case ONIGENC_CTYPE_WORD: if (not != 0) { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_CODE_TO_MBCLEN(enc, c) > 0 /* check invalid code point */ + /* check invalid code point */ + if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) && ! 
ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) BITSET_SET_BIT(cc->bs, c); } for (c = limit; c < SINGLE_BYTE_SIZE; c++) { - if (ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) BITSET_SET_BIT(cc->bs, c); } + if (ascii_mode != 0 && is_single == 0) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) + && ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) BITSET_SET_BIT(cc->bs, c); } - if (ascii_mode == 0) + if (ascii_mode == 0 && is_single == 0) ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } break; @@ -6076,10 +6065,12 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) { int r; OnigCodePoint c; - OnigEncoding enc = env->enc; - UChar *prev, *start, *p = *src; + OnigEncoding enc; + UChar *prev, *start, *p; - r = 0; + p = *src; + enc = env->enc; + r = ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; start = prev = p; while (!PEND) { @@ -6087,18 +6078,20 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) PFETCH_S(c); if (c == '}') { r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); - if (r < 0) break; + if (r >= 0) { + *src = p; + } + else { + onig_scan_env_set_error_string(env, r, *src, prev); + } - *src = p; return r; } else if (c == '(' || c == ')' || c == '{' || c == '|') { - r = ONIGERR_INVALID_CHAR_PROPERTY_NAME; break; } } - onig_scan_env_set_error_string(env, r, *src, prev); return r; } @@ -6114,7 +6107,7 @@ parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* en *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - r = add_ctype_to_cc(cc, ctype, 0, env); + r = add_ctype_to_cc(cc, ctype, FALSE, env); if (r != 0) return r; if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); @@ -6122,67 +6115,67 @@ parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, 
ScanEnv* en } -enum CCSTATE { - CCS_VALUE, - CCS_RANGE, - CCS_COMPLETE, - CCS_START -}; +typedef enum { + CS_VALUE, + CS_RANGE, + CS_COMPLETE, + CS_START +} CSTATE; -enum CCVALTYPE { - CCV_SB, - CCV_CODE_POINT, - CCV_CLASS -}; +typedef enum { + CV_UNDEF, + CV_SB, + CV_MB, + CV_CPROP +} CVAL; static int -next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type, - enum CCSTATE* state, ScanEnv* env) +cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, + ScanEnv* env) { int r; - if (*state == CCS_RANGE) + if (*state == CS_RANGE) return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; - if (*state == CCS_VALUE && *type != CCV_CLASS) { - if (*type == CCV_SB) - BITSET_SET_BIT(cc->bs, (int )(*vs)); - else if (*type == CCV_CODE_POINT) { - r = add_code_range(&(cc->mbuf), env, *vs, *vs); + if (*state == CS_VALUE) { + if (*val == CV_SB) + BITSET_SET_BIT(cc->bs, (int )(*pcode)); + else if (*val == CV_MB) { + r = add_code_range(&(cc->mbuf), env, *pcode, *pcode); if (r < 0) return r; } } - *state = CCS_VALUE; - *type = CCV_CLASS; + *state = CS_VALUE; + *val = CV_CPROP; return 0; } static int -next_state_val(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, - int* from_israw, int to_israw, - enum CCVALTYPE intype, enum CCVALTYPE* type, - enum CCSTATE* state, ScanEnv* env) +cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, + int* from_raw, int to_raw, CVAL intype, CVAL* type, + CSTATE* state, ScanEnv* env) { int r; switch (*state) { - case CCS_VALUE: - if (*type == CCV_SB) { + case CS_VALUE: + if (*type == CV_SB) { if (*from > 0xff) return ONIGERR_INVALID_CODE_POINT_VALUE; BITSET_SET_BIT(cc->bs, (int )(*from)); } - else if (*type == CCV_CODE_POINT) { + else if (*type == CV_MB) { r = add_code_range(&(cc->mbuf), env, *from, *from); if (r < 0) return r; } break; - case CCS_RANGE: + case CS_RANGE: if (intype == *type) { - if (intype == CCV_SB) { + if (intype == CV_SB) { if (*from > 0xff || to > 0xff) return 
ONIGERR_INVALID_CODE_POINT_VALUE; @@ -6211,21 +6204,21 @@ next_state_val(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, if (r < 0) return r; } ccs_range_end: - *state = CCS_COMPLETE; + *state = CS_COMPLETE; break; - case CCS_COMPLETE: - case CCS_START: - *state = CCS_VALUE; + case CS_COMPLETE: + case CS_START: + *state = CS_VALUE; break; default: break; } - *from_israw = to_israw; - *from = to; - *type = intype; + *from_raw = to_raw; + *from = to; + *type = intype; return 0; } @@ -6253,27 +6246,25 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, } static int -parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) { int r, neg, len, fetched, and_start; - OnigCodePoint v, vs; + OnigCodePoint in_code, curr_code; UChar *p; Node* node; CClassNode *cc, *prev_cc; CClassNode work_cc; - - enum CCSTATE state; - enum CCVALTYPE val_type, in_type; - int val_israw, in_israw; + int curr_raw, in_raw; + CSTATE state; + CVAL in_type; + CVAL curr_type; *np = NULL_NODE; - env->parse_depth++; - if (env->parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(env->parse_depth); prev_cc = (CClassNode* )NULL; r = fetch_token_in_cc(tok, src, end, env); - if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { + if (r == TK_CHAR && tok->u.code == (OnigCodePoint )'^' && tok->escaped == 0) { neg = 1; r = fetch_token_in_cc(tok, src, end, env); } @@ -6296,31 +6287,27 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) cc = CCLASS_(node); and_start = 0; - state = CCS_START; + state = CS_START; + curr_type = CV_UNDEF; + p = *src; while (r != TK_CC_CLOSE) { fetched = 0; switch (r) { case TK_CHAR: any_char_in: - len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); - if (len > 1) { - in_type = CCV_CODE_POINT; - } - else if (len < 0) { + len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code); + if (len 
< 0) { r = len; goto err; } - else { - /* sb_char: */ - in_type = CCV_SB; - } - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_type = (len == 1) ? CV_SB : CV_MB; + in_code = tok->u.code; + in_raw = 0; goto val_entry2; break; - case TK_RAW_BYTE: + case TK_CRUDE_BYTE: /* tok->base != 0 : octal or hexadec. */ if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { int i, j; @@ -6329,15 +6316,15 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) UChar* psave = p; int base = tok->base; - buf[0] = tok->u.c; + buf[0] = tok->u.byte; for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; - if (r != TK_RAW_BYTE || tok->base != base) { + if (r != TK_CRUDE_BYTE || tok->base != base) { fetched = 1; break; } - buf[i] = tok->u.c; + buf[i] = tok->u.byte; } if (i < ONIGENC_MBC_MINLEN(env->enc)) { @@ -6362,63 +6349,63 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } if (i == 1) { - v = (OnigCodePoint )buf[0]; - goto raw_single; + in_code = (OnigCodePoint )buf[0]; + goto crude_single; } else { - v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); - in_type = CCV_CODE_POINT; + in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); + in_type = CV_MB; } } else { - v = (OnigCodePoint )tok->u.c; - raw_single: - in_type = CCV_SB; + in_code = (OnigCodePoint )tok->u.byte; + crude_single: + in_type = CV_SB; } - in_israw = 1; + in_raw = 1; goto val_entry2; break; case TK_CODE_POINT: - v = tok->u.code; - in_israw = 1; + in_code = tok->u.code; + in_raw = 1; val_entry: - len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); + len = ONIGENC_CODE_TO_MBCLEN(env->enc, in_code); if (len < 0) { - if (state != CCS_RANGE || + if (state != CS_RANGE || ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || - v < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { + in_code < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { r = len; goto err; } } - in_type = (len == 1 ? 
CCV_SB : CCV_CODE_POINT); + in_type = (len == 1 ? CV_SB : CV_MB); val_entry2: - r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type, - &state, env); + r = cc_char_next(cc, &curr_code, in_code, &curr_raw, in_raw, in_type, + &curr_type, &state, env); if (r != 0) goto err; break; - case TK_POSIX_BRACKET_OPEN: + case TK_CC_POSIX_BRACKET_OPEN: r = parse_posix_bracket(cc, &p, end, env); if (r < 0) goto err; if (r == 1) { /* is not POSIX bracket */ CC_ESC_WARN(env, (UChar* )"["); p = tok->backp; - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_code = tok->u.code; + in_raw = 0; goto val_entry; } - goto next_class; + goto next_cprop; break; case TK_CHAR_TYPE: r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env); if (r != 0) goto err; - next_class: - r = next_state_class(cc, &vs, &val_type, &state, env); + next_cprop: + r = cc_cprop_next(cc, &curr_code, &curr_type, &state, env); if (r != 0) goto err; break; @@ -6431,19 +6418,20 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env); if (r != 0) goto err; - goto next_class; + goto next_cprop; } break; case TK_CC_RANGE: - if (state == CCS_VALUE) { + if (state == CS_VALUE) { r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; if (r == TK_CC_CLOSE) { /* allow [x-] */ range_end_val: - v = (OnigCodePoint )'-'; - in_israw = 0; + in_code = (OnigCodePoint )'-'; + in_raw = 0; goto val_entry; } else if (r == TK_CC_AND) { @@ -6451,20 +6439,21 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto range_end_val; } - if (val_type == CCV_CLASS) { + if (curr_type == CV_CPROP) { r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; goto err; } - state = CCS_RANGE; + state = CS_RANGE; } - else if (state == CCS_START) { + else if (state == CS_START) { /* [-xa] is allowed */ - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_code = tok->u.code; + in_raw = 0; r = 
fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; /* [--x] or [a&&-x] is warned. */ if (r == TK_CC_RANGE || and_start != 0) @@ -6472,15 +6461,17 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto val_entry; } - else if (state == CCS_RANGE) { + else if (state == CS_RANGE) { CC_ESC_WARN(env, (UChar* )"-"); - goto any_char_in; /* [!--x] is allowed */ + goto any_char_in; /* [!--] is allowed */ } - else { /* CCS_COMPLETE */ + else { /* CS_COMPLETE */ r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; - if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ + if (r == TK_CC_CLOSE) + goto range_end_val; /* allow [a-b-] */ else if (r == TK_CC_AND) { CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; @@ -6495,12 +6486,19 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } break; - case TK_CC_CC_OPEN: /* [ */ + case TK_CC_OPEN_CC: /* [ */ { Node *anode; CClassNode* acc; - r = parse_char_class(&anode, tok, &p, end, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); + if (r != 0) goto err; + } + state = CS_COMPLETE; + + r = parse_cc(&anode, tok, &p, end, env); if (r != 0) { onig_node_free(anode); goto cc_open_err; @@ -6516,14 +6514,14 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) case TK_CC_AND: /* && */ { - if (state == CCS_VALUE) { - r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, - &val_type, &state, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); if (r != 0) goto err; } /* initialize local variables */ and_start = 1; - state = CCS_START; + state = CS_START; if (IS_NOT_NULL(prev_cc)) { r = and_cclass(prev_cc, cc, env->enc); @@ -6556,9 +6554,9 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } - if (state == CCS_VALUE) { - 
r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, - &val_type, &state, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); if (r != 0) goto err; } @@ -6591,7 +6589,7 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } *src = p; - env->parse_depth--; + DEC_PARSE_DEPTH(env->parse_depth); return 0; err: @@ -6600,8 +6598,8 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) return r; } -static int parse_subexp(Node** top, PToken* tok, int term, - UChar** src, UChar* end, ScanEnv* env, int group_head); +static int parse_alts(Node** top, PToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env, int group_head); #ifdef USE_CALLOUT @@ -6772,7 +6770,8 @@ parse_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* static int parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, - unsigned int types[], OnigValue vals[], ScanEnv* env) + int max_arg_num, unsigned int types[], OnigValue vals[], + ScanEnv* env) { #define MAX_CALLOUT_ARG_BYTE_LENGTH 128 @@ -6791,9 +6790,9 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; + c = 0; n = 0; while (n < ONIG_CALLOUT_MAX_ARGS_NUM) { - c = 0; cn = 0; esc = 0; eesc = 0; @@ -6826,7 +6825,7 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, size_t clen; add_char: - if (skip_mode == 0) { + if (skip_mode == FALSE) { clen = p - e; if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) return ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */ @@ -6840,7 +6839,10 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, } if (cn != 0) { - if (skip_mode == 0) { + if (max_arg_num >= 0 && n >= max_arg_num) + return ONIGERR_INVALID_CALLOUT_ARG; + + if (skip_mode == FALSE) { if ((types[n] & ONIG_TYPE_LONG) != 0) { int fixed = 0; if (cn > 0) { @@ -6972,7 
+6974,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en /* read for single check only */ save = p; - arg_num = parse_callout_args(1, '}', &p, end, 0, 0, env); + arg_num = parse_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env); if (arg_num < 0) return arg_num; is_not_single = PPEEK_IS(cterm) ? 0 : 1; @@ -6986,7 +6988,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en types[i] = get_callout_arg_type_by_name_id(name_id, i); } - arg_num = parse_callout_args(0, '}', &p, end, types, vals, env); + arg_num = parse_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env); if (arg_num < 0) return arg_num; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; @@ -7086,17 +7088,17 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, group: r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(np, tok, term, &p, end, env, 0); + r = parse_alts(np, tok, term, &p, end, env, FALSE); if (r < 0) return r; *src = p; return 1; /* group */ break; case '=': - *np = onig_node_new_anchor(ANCR_PREC_READ, 0); + *np = onig_node_new_anchor(ANCR_PREC_READ, FALSE); break; case '!': /* preceding read */ - *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, 0); + *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, FALSE); break; case '>': /* (?>...) 
stop backtrack */ *np = node_new_bag(BAG_STOP_BACKTRACK); @@ -7114,9 +7116,9 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; PFETCH(c); if (c == '=') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, 0); + *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, FALSE); else if (c == '!') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, 0); + *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, FALSE); else { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { UChar *name; @@ -7132,7 +7134,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, named_group2: name = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, - &num_type, 0); + &num_type, FALSE); if (r < 0) return r; num = scan_env_add_mem_entry(env); @@ -7146,7 +7148,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, CHECK_NULL_RETURN_MEMERR(*np); BAG_(*np)->m.regnum = num; if (list_capture != 0) - MEM_STATUS_ON_SIMPLE(env->capture_history, num); + MEM_STATUS_ON_SIMPLE(env->cap_history, num); env->num_named++; } else { @@ -7181,7 +7183,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&absent, tok, term, &p, end, env, 1); + r = parse_alts(&absent, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(absent); return r; @@ -7268,7 +7270,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (r == 1) exist_level = 1; #else r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? 
c : '('), - &p, end, &name_end, env, &back_num, &num_type, 1); + &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) { if (is_enclosed == 0) { @@ -7288,11 +7290,11 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].node)) + IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } - condition = node_new_backref_checker(1, &back_num, 0, + condition = node_new_backref_checker(1, &back_num, FALSE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif @@ -7310,12 +7312,12 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].node)) + IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } - condition = node_new_backref_checker(num, backs, 1, + condition = node_new_backref_checker(num, backs, TRUE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif @@ -7357,7 +7359,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, condition_is_checker = 0; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&condition, tok, term, &p, end, env, 0); + r = parse_alts(&condition, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(condition); return r; @@ -7400,7 +7402,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, onig_node_free(condition); return r; } - r = parse_subexp(&target, tok, term, &p, end, env, 1); + r = parse_alts(&target, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(condition); onig_node_free(target); @@ -7465,7 +7467,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; } BAG_(*np)->m.regnum = num; - MEM_STATUS_ON_SIMPLE(env->capture_history, num); + 
MEM_STATUS_ON_SIMPLE(env->cap_history, num); } else { return ONIGERR_UNDEFINED_GROUP_OPTION; @@ -7501,7 +7503,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, case 'm': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { - OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0)); + OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? TRUE : FALSE)); } else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) { @@ -7537,16 +7539,16 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 0); - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 1); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE); break; #ifdef USE_UNICODE_WORD_BREAK case 'w': if (! 
ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 0); - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 1); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE); break; #endif default: @@ -7576,7 +7578,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, &p, end, env, 0); + r = parse_alts(&target, tok, term, &p, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7623,7 +7625,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, CHECK_NULL_RETURN_MEMERR(*np); r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, &p, end, env, 0); + r = parse_alts(&target, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(target); return r; @@ -7633,7 +7635,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (NODE_TYPE(*np) == NODE_BAG) { if (BAG_(*np)->type == BAG_MEMORY) { - /* Don't move this to previous of parse_subexp() */ + /* Don't move this to previous of parse_alts() */ r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np); if (r != 0) return r; } @@ -7653,7 +7655,7 @@ static const char* ReduceQStr[] = { }; static int -set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) +assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env) { QuantNode* qn; @@ -7725,9 +7727,11 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) } } else { + int r; + NODE_BODY(qnode) = target; - onig_reduce_nested_quantifier(qnode, target); - goto q_exit; + r = onig_reduce_nested_quantifier(qnode); + return r; } } break; @@ -7737,7 +7741,6 @@ set_quantifier(Node* qnode, Node* target, int group, 
ScanEnv* env) } NODE_BODY(qnode) = target; - q_exit: return 0; } @@ -7767,6 +7770,38 @@ clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) } #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ +#define ADD_CODE_INTO_CC(cc, code, enc) do {\ + if (ONIGENC_MBC_MINLEN(enc) > 1 || ONIGENC_CODE_TO_MBCLEN(enc, code) != 1) {\ + add_code_range_to_buf(&((cc)->mbuf), code, code);\ + }\ + else {\ + BITSET_SET_BIT((cc)->bs, code);\ + }\ +} while (0) + +extern int +onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, + int n, OnigCodePoint codes[]) +{ + int i; + Node* node; + CClassNode* cc; + + *rnode = NULL_NODE; + + node = node_new_cclass(); + CHECK_NULL_RETURN_MEMERR(node); + + cc = CCLASS_(node); + + for (i = 0; i < n; i++) { + ADD_CODE_INTO_CC(cc, codes[i], enc); + } + + *rnode = node; + return 0; +} + typedef struct { ScanEnv* env; CClassNode* cc; @@ -7780,37 +7815,31 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) IApplyCaseFoldArg* iarg; ScanEnv* env; CClassNode* cc; - BitSetRef bs; iarg = (IApplyCaseFoldArg* )arg; env = iarg->env; cc = iarg->cc; - bs = cc->bs; if (to_len == 1) { int is_in = onig_is_code_in_cc(env->enc, from, cc); #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || (is_in == 0 && IS_NCCLASS_NOT(cc))) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { - add_code_range(&(cc->mbuf), env, *to, *to); - } - else { - BITSET_SET_BIT(bs, *to); - } + ADD_CODE_INTO_CC(cc, *to, env->enc); } #else if (is_in != 0) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { + if (ONIGENC_MBC_MINLEN(env->enc) > 1 || + ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) { if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); add_code_range(&(cc->mbuf), env, *to, *to); } else { if (IS_NCCLASS_NOT(cc)) { - BITSET_CLEAR_BIT(bs, *to); + BITSET_CLEAR_BIT(cc->bs, *to); } else - BITSET_SET_BIT(bs, *to); + BITSET_SET_BIT(cc->bs, *to); } } #endif /* 
CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ @@ -7818,34 +7847,65 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) else { int r, i, len; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - Node *snode = NULL_NODE; if (onig_is_code_in_cc(env->enc, from, cc) #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS && !IS_NCCLASS_NOT(cc) #endif ) { + int n, j, m, index; + Node* list_node; + Node* ns[3]; + + n = 0; for (i = 0; i < to_len; i++) { - len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); - if (i == 0) { - snode = onig_node_new_str(buf, buf + len); - CHECK_NULL_RETURN_MEMERR(snode); - - /* char-class expanded multi-char only - compare with string folded at match time. */ - NODE_STRING_SET_AMBIG(snode); + OnigCodePoint code; + Node* csnode; + CClassNode* cs_cc; + + index = onigenc_unicode_fold1_key(&to[i]); + if (index >= 0) { + csnode = node_new_cclass(); + cs_cc = CCLASS_(csnode); + if (IS_NULL(csnode)) { + err_free_ns: + for (j = 0; j < n; j++) onig_node_free(ns[j]); + return ONIGERR_MEMORY; + } + m = FOLDS1_UNFOLDS_NUM(index); + for (j = 0; j < m; j++) { + code = FOLDS1_UNFOLDS(index)[j]; + ADD_CODE_INTO_CC(cs_cc, code, env->enc); + } + ADD_CODE_INTO_CC(cs_cc, to[i], env->enc); + ns[n++] = csnode; } else { - r = onig_node_str_cat(snode, buf, buf + len); - if (r < 0) { - onig_node_free(snode); - return r; + len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); + if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) { + csnode = onig_node_new_str(buf, buf + len); + if (IS_NULL(csnode)) goto err_free_ns; + + NODE_STRING_SET_CASE_EXPANDED(csnode); + ns[n++] = csnode; + } + else { + r = onig_node_str_cat(ns[n-1], buf, buf + len); + if (r < 0) goto err_free_ns; } } } - *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_MEMERR(*(iarg->ptail)); + if (n == 1) + list_node = ns[0]; + else + list_node = make_list(n, ns); + + *(iarg->ptail) = onig_node_new_alt(list_node, NULL_NODE); + if (IS_NULL(*(iarg->ptail))) { + 
onig_node_free(list_node); + return ONIGERR_MEMORY; + } iarg->ptail = &(NODE_CDR((*(iarg->ptail)))); } } @@ -7901,7 +7961,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = BAG_(*np)->o.options; r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, src, end, env, 0); + r = parse_alts(&target, tok, term, src, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7916,7 +7976,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; - if (tok->escaped) goto tk_raw_byte; + if (tok->escaped) goto tk_crude_byte; else goto tk_byte; break; @@ -7941,36 +8001,36 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, } break; - case TK_RAW_BYTE: - tk_raw_byte: + case TK_CRUDE_BYTE: + tk_crude_byte: { - *np = node_new_str_raw_char((UChar )tok->u.c); + *np = node_new_str_crude_char(tok->u.byte); CHECK_NULL_RETURN_MEMERR(*np); len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { if (len == enclen(env->enc, STR_(*np)->s)) { r = fetch_token(tok, src, end, env); - goto tk_raw_byte_end; + goto tk_crude_byte_end; } } r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_RAW_BYTE) + if (r != TK_CRUDE_BYTE) return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - r = node_str_cat_char(*np, (UChar )tok->u.c); + r = node_str_cat_char(*np, tok->u.byte); if (r < 0) return r; len++; } - tk_raw_byte_end: + tk_crude_byte_end: if (! 
ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) return ONIGERR_INVALID_WIDE_CHAR_VALUE; - NODE_STRING_CLEAR_RAW(*np); + NODE_STRING_CLEAR_CRUDE(*np); goto string_end; } break; @@ -7981,7 +8041,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); if (len < 0) return len; #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - *np = node_new_str_raw(buf, buf + len); + *np = node_new_str_crude(buf, buf + len); #else *np = node_new_str(buf, buf + len); #endif @@ -8024,7 +8084,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); + add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env); if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); } break; @@ -8041,11 +8101,11 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (r != 0) return r; break; - case TK_CC_OPEN: + case TK_OPEN_CC: { CClassNode* cc; - r = parse_char_class(np, tok, src, end, env); + r = parse_cc(np, tok, src, end, env); if (r != 0) return r; cc = CCLASS_(*np); @@ -8083,7 +8143,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_ANYCHAR_ANYTIME: *np = node_new_anychar(); CHECK_NULL_RETURN_MEMERR(*np); - qn = node_new_quantifier(0, INFINITE_REPEAT, 0); + qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE); CHECK_NULL_RETURN_MEMERR(qn); NODE_BODY(qn) = *np; *np = qn; @@ -8186,9 +8246,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (is_invalid_quantifier_target(*tp)) return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; - parse_depth++; - if (parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(parse_depth); qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, r == TK_INTERVAL); @@ -8201,9 +8259,10 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, 
else { target = *tp; } - r = set_quantifier(qn, target, group, env); + r = assign_quantifier_body(qn, target, group, env); if (r < 0) { onig_node_free(qn); + *tp = NULL_NODE; return r; } @@ -8256,6 +8315,8 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, Node *node, **headp; *top = NULL; + INC_PARSE_DEPTH(env->parse_depth); + r = parse_exp(&node, tok, term, src, end, env, group_head); if (r < 0) { onig_node_free(node); @@ -8266,7 +8327,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, *top = node; } else { - *top = node_new_list(node, NULL); + *top = node_new_list(node, NULL); if (IS_NULL(*top)) { onig_node_free(node); return ONIGERR_MEMORY; @@ -8274,7 +8335,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, headp = &(NODE_CDR(*top)); while (r != TK_EOT && r != term && r != TK_ALT) { - r = parse_exp(&node, tok, term, src, end, env, 0); + r = parse_exp(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; @@ -8292,21 +8353,20 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, } } + DEC_PARSE_DEPTH(env->parse_depth); return r; } /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ static int -parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) +parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env, int group_head) { int r; Node *node, **headp; *top = NULL; - env->parse_depth++; - if (env->parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(env->parse_depth); r = parse_branch(&node, tok, term, src, end, env, group_head); if (r < 0) { @@ -8328,7 +8388,7 @@ parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, while (r == TK_ALT) { r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_branch(&node, tok, term, src, end, env, 0); + r = parse_branch(&node, tok, term, src, end, env, FALSE); if 
(r < 0) { onig_node_free(node); return r; @@ -8355,7 +8415,7 @@ parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_PARSER_BUG; } - env->parse_depth--; + DEC_PARSE_DEPTH(env->parse_depth); return r; } @@ -8367,7 +8427,7 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) r = fetch_token(&tok, src, end, env); if (r < 0) return r; - r = parse_subexp(top, &tok, TK_EOT, src, end, env, 0); + r = parse_alts(top, &tok, TK_EOT, src, end, env, FALSE); if (r < 0) return r; return 0; diff --git a/src/regparse.h b/src/regparse.h index 231f7b5..1525ccb 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -4,7 +4,7 @@ regparse.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,7 +32,7 @@ #include "regint.h" #define NODE_STRING_MARGIN 16 -#define NODE_STRING_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ +#define NODE_STRING_BUF_SIZE 20 /* sizeof(CClassNode) - sizeof(int)*4 */ #define NODE_BACKREFS_SIZE 6 /* node type */ @@ -73,20 +73,25 @@ enum BodyEmptyType { BODY_IS_EMPTY_POSSIBILITY_REC = 3 }; +struct _Node; + typedef struct { NodeType node_type; int status; + struct _Node* parent; UChar* s; UChar* end; unsigned int flag; - int capacity; /* (allocated size - 1) or 0: use buf[] */ UChar buf[NODE_STRING_BUF_SIZE]; + int capacity; /* (allocated size - 1) or 0: use buf[] */ + int case_min_len; } StrNode; typedef struct { NodeType node_type; int status; + struct _Node* parent; unsigned int flags; BitSet bs; @@ -96,6 +101,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; int lower; @@ -104,12 +110,13 @@ typedef struct { enum BodyEmptyType emptiness; struct _Node* head_exact; struct _Node* next_head_exact; - int is_refered; 
/* include called node. don't eliminate even if {0} */ + int include_referred; /* include called node. don't eliminate even if {0} */ } QuantNode; typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; enum BagType type; @@ -152,6 +159,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; /* to BagNode : BAG_MEMORY */ int by_number; @@ -166,6 +174,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; int back_num; int back_static[NODE_BACKREFS_SIZE]; @@ -176,6 +185,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; int type; @@ -186,6 +196,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* car; struct _Node* cdr; @@ -194,6 +205,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; int ctype; int not; @@ -204,6 +216,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; enum GimmickType type; int detail_type; @@ -216,6 +229,7 @@ typedef struct _Node { struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; } base; @@ -280,26 +294,21 @@ typedef struct _Node { #define ANCR_ANYCHAR_INF_MASK (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML) #define ANCR_END_BUF_MASK (ANCR_END_BUF | ANCR_SEMI_END_BUF) -#define NODE_STRING_RAW (1<<0) /* by backslashed number */ -#define NODE_STRING_AMBIG (1<<1) -#define NODE_STRING_GOOD_AMBIG (1<<2) -#define NODE_STRING_DONT_GET_OPT_INFO (1<<3) +#define NODE_STRING_CRUDE (1<<0) +#define NODE_STRING_CASE_EXPANDED (1<<1) +#define NODE_STRING_CASE_FOLD_MATCH (1<<2) #define NODE_STRING_LEN(node) (int )((node)->u.str.end - (node)->u.str.s) -#define NODE_STRING_SET_RAW(node) (node)->u.str.flag |= NODE_STRING_RAW -#define NODE_STRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NODE_STRING_RAW 
-#define NODE_STRING_SET_AMBIG(node) (node)->u.str.flag |= NODE_STRING_AMBIG -#define NODE_STRING_SET_GOOD_AMBIG(node) (node)->u.str.flag |= NODE_STRING_GOOD_AMBIG -#define NODE_STRING_SET_DONT_GET_OPT_INFO(node) \ - (node)->u.str.flag |= NODE_STRING_DONT_GET_OPT_INFO -#define NODE_STRING_IS_RAW(node) \ - (((node)->u.str.flag & NODE_STRING_RAW) != 0) -#define NODE_STRING_IS_AMBIG(node) \ - (((node)->u.str.flag & NODE_STRING_AMBIG) != 0) -#define NODE_STRING_IS_GOOD_AMBIG(node) \ - (((node)->u.str.flag & NODE_STRING_GOOD_AMBIG) != 0) -#define NODE_STRING_IS_DONT_GET_OPT_INFO(node) \ - (((node)->u.str.flag & NODE_STRING_DONT_GET_OPT_INFO) != 0) +#define NODE_STRING_SET_CRUDE(node) (node)->u.str.flag |= NODE_STRING_CRUDE +#define NODE_STRING_CLEAR_CRUDE(node) (node)->u.str.flag &= ~NODE_STRING_CRUDE +#define NODE_STRING_SET_CASE_EXPANDED(node) (node)->u.str.flag |= NODE_STRING_CASE_EXPANDED +#define NODE_STRING_SET_CASE_FOLD_MATCH(node) (node)->u.str.flag |= NODE_STRING_CASE_FOLD_MATCH +#define NODE_STRING_IS_CRUDE(node) \ + (((node)->u.str.flag & NODE_STRING_CRUDE) != 0) +#define NODE_STRING_IS_CASE_EXPANDED(node) \ + (((node)->u.str.flag & NODE_STRING_CASE_EXPANDED) != 0) +#define NODE_STRING_IS_CASE_FOLD_MATCH(node) \ + (((node)->u.str.flag & NODE_STRING_CASE_FOLD_MATCH) != 0) #define BACKREFS_P(br) \ (IS_NOT_NULL((br)->back_dynamic) ? 
(br)->back_dynamic : (br)->back_static) @@ -326,6 +335,7 @@ typedef struct _Node { #define NODE_ST_FIXED_OPTION (1<<18) #define NODE_ST_PROHIBIT_RECURSION (1<<19) #define NODE_ST_SUPER (1<<20) +#define NODE_ST_EMPTY_STATUS_CHECK (1<<21) #define NODE_STATUS(node) (((Node* )node)->u.base.status) @@ -355,7 +365,10 @@ typedef struct _Node { ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) #define NODE_IS_STRICT_REAL_REPEAT(node) \ ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0) +#define NODE_IS_EMPTY_STATUS_CHECK(node) \ + ((NODE_STATUS(node) & NODE_ST_EMPTY_STATUS_CHECK) != 0) +#define NODE_PARENT(node) ((node)->u.base.parent) #define NODE_BODY(node) ((node)->u.base.body) #define NODE_QUANT_BODY(node) ((node)->body) #define NODE_BAG_BODY(node) ((node)->body) @@ -368,11 +381,8 @@ typedef struct _Node { (senv)->mem_env_dynamic : (senv)->mem_env_static) typedef struct { - Node* node; -#if 0 - int in; - int recursion; -#endif + Node* mem_node; + Node* empty_repeat_node; } MemEnv; typedef struct { @@ -384,9 +394,8 @@ typedef struct { OnigCaseFoldType case_fold_flag; OnigEncoding enc; OnigSyntaxType* syntax; - MemStatusType capture_history; - MemStatusType bt_mem_start; - MemStatusType bt_mem_end; + MemStatusType cap_history; + MemStatusType backtrack_mem; /* backtrack/recursion */ MemStatusType backrefed_mem; UChar* pattern; UChar* pattern_end; @@ -404,7 +413,10 @@ typedef struct { MemEnv mem_env_static[SCANENV_MEMENV_SIZE]; MemEnv* mem_env_dynamic; unsigned int parse_depth; - +#ifdef ONIG_DEBUG_PARSE + unsigned int max_parse_depth; +#endif + int backref_num; int keep_num; int save_num; int save_alloc_num; @@ -425,9 +437,7 @@ extern int onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map)); extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); -extern 
int onig_scan_unsigned_number P_((UChar** src, const UChar* end, OnigEncoding enc)); -extern void onig_reduce_nested_quantifier P_((Node* pnode, Node* cnode)); -extern void onig_node_conv_to_str_node P_((Node* node, int raw)); +extern int onig_reduce_nested_quantifier P_((Node* pnode)); extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end)); extern void onig_node_free P_((Node* node)); @@ -435,13 +445,13 @@ extern Node* onig_node_new_bag P_((enum BagType type)); extern Node* onig_node_new_anchor P_((int type, int ascii_mode)); extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); extern Node* onig_node_new_list P_((Node* left, Node* right)); -extern Node* onig_node_list_add P_((Node* list, Node* x)); extern Node* onig_node_new_alt P_((Node* left, Node* right)); extern void onig_node_str_clear P_((Node* node)); extern int onig_names_free P_((regex_t* reg)); extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); extern int onig_free_shared_cclass_table P_((void)); extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); +extern int onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]); extern OnigLen onig_get_tiny_min_len(Node* node, unsigned int inhibit_node_types, int* invalid_node); #ifdef USE_CALLOUT diff --git a/src/regposerr.c b/src/regposerr.c index e389531..e1747c5 100644 --- a/src/regposerr.c +++ b/src/regposerr.c @@ -2,7 +2,7 @@ regposerr.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/regposix.c b/src/regposix.c index 09e16ac..b3e78ff 100644 --- a/src/regposix.c +++ b/src/regposix.c @@ -2,7 +2,7 @@ regposix.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/regsyntax.c b/src/regsyntax.c index d4420cc..513c7f7 100644 --- a/src/regsyntax.c +++ b/src/regsyntax.c @@ -2,7 +2,7 @@ regsyntax.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/regtrav.c b/src/regtrav.c index 58a17f5..8307695 100644 --- a/src/regtrav.c +++ b/src/regtrav.c @@ -2,7 +2,7 @@ regtrav.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2004 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/regversion.c b/src/regversion.c index 594a52c..de993d3 100644 --- a/src/regversion.c +++ b/src/regversion.c @@ -2,7 +2,7 @@ regversion.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/sjis.c b/src/sjis.c index 4f90b72..1fd92d9 100644 --- a/src/sjis.c +++ b/src/sjis.c @@ -2,7 +2,7 @@ sjis.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -149,10 +149,6 @@ code_to_mbc(OnigCodePoint code, UChar *buf) if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff)); *p++ = (UChar )(code & 0xff); -#if 0 - if (enclen(ONIG_ENCODING_SJIS, buf) != (p - buf)) - return REGERR_INVALID_CODE_POINT_VALUE; -#endif return (int )(p - buf); } @@ -179,31 +175,6 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, } } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end); - -} -#endif - -#if 0 -static int -is_code_ctype(OnigCodePoint code, unsigned int ctype) -{ - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) { - return (code_to_mbclen(code) > 1 ? TRUE : FALSE); - } - } - - return FALSE; -} -#endif - static UChar* left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/sjis_prop.c b/src/sjis_prop.c index 3a88a38..e33fbb2 100644 --- a/src/sjis_prop.c +++ b/src/sjis_prop.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -pt -T -L ANSI-C -N onigenc_sjis_lookup_property_name --output-file gperf2.tmp sjis_prop.gperf */ +/* Command-line: gperf -pt -T -L ANSI-C -N onigenc_sjis_lookup_property_name --output-file gperf2.tmp sjis_prop.gperf */ /* Computed positions: -k'1,3' */ #if !((' ' == 32) && ('!' 
== 33) && ('"' == 34) && ('#' == 35) \ diff --git a/src/unicode.c b/src/unicode.c index 5820319..474436a 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -2,7 +2,7 @@ unicode.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -356,16 +356,15 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (fn = 0; fn < 2; fn++) { int index; cs[fn][0] = FOLDS2_FOLD(buk->index)[fn]; + ncs[fn] = 1; index = onigenc_unicode_fold1_key(&cs[fn][0]); if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i]; } - ncs[fn] = m + 1; + ncs[fn] += m; } - else - ncs[fn] = 1; } for (i = 0; i < ncs[0]; i++) { @@ -393,16 +392,15 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (fn = 0; fn < 3; fn++) { int index; cs[fn][0] = FOLDS3_FOLD(buk->index)[fn]; + ncs[fn] = 1; index = onigenc_unicode_fold1_key(&cs[fn][0]); if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i]; } - ncs[fn] = m + 1; + ncs[fn] += m; } - else - ncs[fn] = 1; } for (i = 0; i < ncs[0]; i++) { diff --git a/src/unicode_egcb_data.c b/src/unicode_egcb_data.c index 6a74c77..3c49422 100644 --- a/src/unicode_egcb_data.c +++ b/src/unicode_egcb_data.c @@ -1,6 +1,6 @@ /* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */ /*- - * Copyright (c) 2017-2018 K.Kosako + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,7 +25,7 @@ * SUCH DAMAGE. 
*/ -#define GRAPHEME_BREAK_PROPERTY_VERSION 12_1_0 +#define GRAPHEME_BREAK_PROPERTY_VERSION 120100 /* CR diff --git a/src/unicode_fold1_key.c b/src/unicode_fold1_key.c index b84b528..171a0fa 100644 --- a/src/unicode_fold1_key.c +++ b/src/unicode_fold1_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold1_key unicode_fold1_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold1_key unicode_fold1_key.gperf */ /* Computed positions: -k'1-3' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -2983,7 +2983,7 @@ onigenc_unicode_fold1_key(OnigCodePoint codes[]) 4026 }; - if (0 == 0) + { int key = hash(codes); diff --git a/src/unicode_fold2_key.c b/src/unicode_fold2_key.c index 2310f0a..c39b19d 100644 --- a/src/unicode_fold2_key.c +++ b/src/unicode_fold2_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold2_key unicode_fold2_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold2_key unicode_fold2_key.gperf */ /* Computed positions: -k'3,6' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -211,7 +211,7 @@ onigenc_unicode_fold2_key(OnigCodePoint codes[]) 129 }; - if (0 == 0) + { int key = hash(codes); diff --git a/src/unicode_fold3_key.c b/src/unicode_fold3_key.c index 0e02a62..295c447 100644 --- a/src/unicode_fold3_key.c +++ b/src/unicode_fold3_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold3_key unicode_fold3_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold3_key unicode_fold3_key.gperf */ /* Computed positions: -k'3,6,9' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,7 +121,7 @@ onigenc_unicode_fold3_key(OnigCodePoint codes[]) 0 }; - if (0 == 0) + { int key = hash(codes); diff --git a/src/unicode_fold_data.c b/src/unicode_fold_data.c index 0dbf9ae..68694b0 100644 --- a/src/unicode_fold_data.c +++ b/src/unicode_fold_data.c @@ -1,7 +1,7 @@ /* This file was generated by make_unicode_fold_data.py. 
*/ #include "regenc.h" -#define UNICODE_CASEFOLD_VERSION 12_1_0 +#define UNICODE_CASEFOLD_VERSION 120100 OnigCodePoint OnigUnicodeFolds1[] = { diff --git a/src/unicode_property_data.c b/src/unicode_property_data.c index 5c1c8a9..0083dd6 100644 --- a/src/unicode_property_data.c +++ b/src/unicode_property_data.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf1.tmp unicode_property_data.gperf */ +/* Command-line: gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf1.tmp unicode_property_data.gperf */ /* Computed positions: -k'1-3,5-6,12,16,$' */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ @@ -29580,7 +29580,8 @@ unicode_lookup_property_name (register const char *str, register size_t len) -#define UNICODE_PROPERTY_VERSION 12_1_0 +#define UNICODE_PROPERTY_VERSION 120100 +#define UNICODE_EMOJI_VERSION 1201 #define PROPERTY_NAME_MAX_SIZE 59 #define CODE_RANGES_NUM 568 diff --git a/src/unicode_property_data_posix.c b/src/unicode_property_data_posix.c index eddc108..e299e85 100644 --- a/src/unicode_property_data_posix.c +++ b/src/unicode_property_data_posix.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf2.tmp unicode_property_data_posix.gperf */ +/* Command-line: gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf2.tmp unicode_property_data_posix.gperf */ /* Computed positions: -k'1,3' */ #if !((' ' == 32) && ('!' 
== 33) && ('"' == 34) && ('#' == 35) \ diff --git a/src/unicode_unfold_key.c b/src/unicode_unfold_key.c index b2228e0..51a037b 100644 --- a/src/unicode_unfold_key.c +++ b/src/unicode_unfold_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_unfold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1,0 -N onigenc_unicode_unfold_key unicode_unfold_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1,0 -N onigenc_unicode_unfold_key unicode_unfold_key.gperf */ /* Computed positions: -k'1-3' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -3288,7 +3288,7 @@ onigenc_unicode_unfold_key(OnigCodePoint code) {0x1e907, 4005, 1} }; - if (0 == 0) + { int key = hash(&code); diff --git a/src/unicode_wb_data.c b/src/unicode_wb_data.c index 7778157..8e1a267 100644 --- a/src/unicode_wb_data.c +++ b/src/unicode_wb_data.c @@ -1,6 +1,6 @@ /* unicode_wb_data.c: Generated by make_unicode_wb_data.py. */ /*- - * Copyright (c) 2019 K.Kosako + * Copyright (c) 2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,7 +25,7 @@ * SUCH DAMAGE. */ -#define WORD_BREAK_PROPERTY_VERSION 12_1_0 +#define WORD_BREAK_PROPERTY_VERSION 120100 /* ALetter diff --git a/src/utf16_be.c b/src/utf16_be.c index b66d868..d99af71 100644 --- a/src/utf16_be.c +++ b/src/utf16_be.c @@ -2,7 +2,7 @@ utf16_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -227,39 +227,6 @@ utf16be_mbc_case_fold(OnigCaseFoldType flag, pp, end, fold); } -#if 0 -static int -utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += EncLen_UTF16[*p]; - - if (*p == 0) { - int c, v; - - p++; - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf16be_left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/utf16_le.c b/src/utf16_le.c index cdc74b0..c6edd94 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -2,7 +2,7 @@ utf16_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -227,39 +227,6 @@ utf16le_mbc_case_fold(OnigCaseFoldType flag, fold); } -#if 0 -static int -utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, - const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += EncLen_UTF16[*(p+1)]; - - if (*(p+1) == 0) { - int c, v; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? 
TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf16le_left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/utf32_be.c b/src/utf32_be.c index dd17d3b..67e50a2 100644 --- a/src/utf32_be.c +++ b/src/utf32_be.c @@ -2,7 +2,7 @@ utf32_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -119,39 +119,6 @@ utf32be_mbc_case_fold(OnigCaseFoldType flag, fold); } -#if 0 -static int -utf32be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += 4; - - if (*(p+2) == 0 && *(p+1) == 0 && *p == 0) { - int c, v; - - p += 3; - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf32be_left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/utf32_le.c b/src/utf32_le.c index d9fe3c6..2ae2275 100644 --- a/src/utf32_le.c +++ b/src/utf32_le.c @@ -2,7 +2,7 @@ utf32_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -120,38 +120,6 @@ utf32le_mbc_case_fold(OnigCaseFoldType flag, fold); } -#if 0 -static int -utf32le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += 4; - - if (*(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) { - int c, v; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf32le_left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/utf8.c b/src/utf8.c index 70c1503..1178d09 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -2,7 +2,7 @@ utf8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -97,33 +97,6 @@ is_valid_mbc_string(const UChar* p, const UChar* end) return TRUE; } -#if 0 -static int -is_mbc_newline(const UChar* p, const UChar* end) -{ - if (p < end) { - if (*p == 0x0a) return 1; - -#ifdef USE_UNICODE_ALL_LINE_TERMINATORS -#ifndef USE_CRNL_AS_LINE_TERMINATOR - if (*p == 0x0d) return 1; -#endif - if (p + 1 < end) { - if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */ - return 1; - if (p + 2 < end) { - if ((*(p+2) == 0xa8 || *(p+2) == 0xa9) - && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ - return 1; - } - } -#endif - } - - return 0; -} -#endif - static OnigCodePoint mbc_to_code(const UChar* p, const UChar* end) { diff --git a/test/Makefile.am b/test/Makefile.am index 67b5d1e..4d62568 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -6,9 +6,9 @@ AM_CFLAGS = -Wall -Wno-invalid-source-encoding AM_CPPFLAGS = -I$(top_srcdir)/src if ENABLE_POSIX_API -TESTS = test_utf8 testc testp testcu +TESTS = test_utf8 testc testp testcu test_regset else -TESTS = test_utf8 testc testcu +TESTS = test_utf8 testc testcu test_regset endif check_PROGRAMS = $(TESTS) @@ -24,6 +24,9 @@ if ENABLE_POSIX_API endif @echo "[Oniguruma API, UTF-16 check]" @./testcu | grep RESULT + @echo "" + @echo "[Oniguruma API, regset check]" + @./test_regset test_uchar: @echo "[UChar in oniguruma.h check]" @@ -44,9 +47,13 @@ testp_CFLAGS = -DPOSIX_TEST -Wall -Wno-invalid-source-encoding testcu_SOURCES = testu.c testcu_LDADD = $(lib_onig) +test_regset_SOURCES = test_regset.c +test_regset_LDADD = $(lib_onig) + gcov: make CFLAGS="--coverage" test_utf8 make CFLAGS="--coverage" testc make CFLAGS="--coverage" testp make CFLAGS="--coverage" testcu + make CFLAGS="--coverage" test_regset diff --git a/test/test_regset.c b/test/test_regset.c new file mode 100644 index 0000000..497fbd6 --- /dev/null +++ b/test/test_regset.c @@ -0,0 +1,465 @@ +/* + * test_regset.c --- test for regset API + * Copyright (c) 2019 K.Kosako + 
*/ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include "oniguruma.h" + +static int nsucc = 0; +static int nfail = 0; +static int nerror = 0; + + +static int +make_regset(int line_no, int n, char* pat[], OnigRegSet** rset, int error_no) +{ + int r; + int i; + OnigRegSet* set; + regex_t* reg; + OnigErrorInfo einfo; + + *rset = NULL; + r = onig_regset_new(&set, 0, NULL); + if (r != 0) return r; + + for (i = 0; i < n; i++) { + r = onig_new(&reg, (UChar* )pat[i], (UChar* )(pat[i] + strlen(pat[i])), + ONIG_OPTION_DEFAULT, ONIG_ENCODING_UTF8, ONIG_SYNTAX_DEFAULT, + &einfo); + if (r != 0) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + + if (error_no == 0) { + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stderr, "ERROR: %d: %s /%s/\n", line_no, s, pat[i]); + nerror++; + } + else { + if (r == error_no) { + fprintf(stdout, "OK(ERROR): %d: /%s/ %d\n", line_no, pat[i], r); + nsucc++; + } + else { + fprintf(stdout, "FAIL(ERROR): %d: /%s/ %d, %d\n", + line_no, pat[i], error_no, r); + nfail++; + } + } + return r; + } + + r = onig_regset_add(set, reg); + if (r != 0) { + onig_regset_free(set); + fprintf(stderr, "ERROR: %d: onig_regset_add(): /%s/\n", line_no, pat[i]); + nerror++; + return r; + } + } + + *rset = set; + return 0; +} + +#ifndef _WIN32 + +static double +get_sec(struct timespec* ts, struct timespec* te) +{ + double t; + + t = (te->tv_sec - ts->tv_sec) + + (double )(te->tv_nsec - ts->tv_nsec) / 1000000000.0; + return t; +} + +/* clock_gettime() doesn't exist in Windows */ + +static int +time_test(int repeat, int n, char* ps[], char* s, char* end, double* rt_set, double* rt_reg) +{ + int r; + int i; + int match_pos; + OnigRegSet* set; + struct timespec ts1, ts2; + double t_set, t_reg; + + r = make_regset(0, n, ps, &set, 0); + if (r != 0) return r; + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1); + + for (i = 0; i < repeat; i++) { + r = onig_regset_search(set, (UChar* )s, (UChar* )end, (UChar* )s, (UChar* )end, + ONIG_REGSET_POSITION_LEAD, ONIG_OPTION_NONE, 
&match_pos); + if (r < 0) { + fprintf(stderr, "FAIL onig_regset_search(POSITION_LEAD): %d\n", r); + return r; + } + } + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2); + t_set = get_sec(&ts1, &ts2); + + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1); + + for (i = 0; i < repeat; i++) { + r = onig_regset_search(set, (UChar* )s, (UChar* )end, (UChar* )s, (UChar* )end, + ONIG_REGSET_REGEX_LEAD, ONIG_OPTION_NONE, &match_pos); + if (r < 0) { + fprintf(stderr, "FAIL onig_regset_search(REGEX_LEAD): %d\n", r); + return r; + } + } + + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2); + onig_regset_free(set); + + t_reg = get_sec(&ts1, &ts2); + + *rt_set = t_set; + *rt_reg = t_reg; + return 0; +} +#endif + +static void +fisher_yates_shuffle(int n, char* ps[], char* cps[]) +{ +#define GET_RAND(n) (rand()%(n+1)) +#define SWAP(a,b) { char* tmp = a; a = b; b = tmp; } + + int i; + + for (i = 0; i < n; i++) + cps[i] = ps[i]; + + for (i = n - 1; i > 0; i--) { + int x = GET_RAND(i); + SWAP(cps[i], cps[x]); + } +} + +#ifndef _WIN32 +static void +time_compare(int n, char* ps[], char* s, char* end) +{ + int r; + int i; + int repeat; + double t_set, t_reg; + double total_set, total_reg; + char** cps; + + cps = (char** )malloc(sizeof(char*) * n); + if (cps == 0) return ; + + repeat = 100 / n; + total_set = total_reg = 0.0; + for (i = 0; i < n; i++) { + fisher_yates_shuffle(n, ps, cps); + r = time_test(repeat, n, cps, s, end, &t_set, &t_reg); + if (r != 0) return ; + total_set += t_set; + total_reg += t_reg; + } + + free(cps); + + fprintf(stdout, "POS lead: %6.2lfmsec. 
REG lead: %6.2lfmsec.\n", + total_set * 1000.0, total_reg * 1000.0); +} +#endif + + +static OnigRegSetLead XX_LEAD = ONIG_REGSET_POSITION_LEAD; + +static void +xx(int line_no, int n, char* ps[], char* s, int from, int to, int mem, int not, int error_no) +{ + int r; + int match_pos; + int match_index; + OnigRegSet* set; + char *end; + + r = make_regset(line_no, n, ps, &set, error_no); + if (r != 0) return ; + + end = s + strlen(s); + + r = onig_regset_search(set, (UChar* )s, (UChar* )end, (UChar* )s, (UChar* )end, + XX_LEAD, ONIG_OPTION_NONE, &match_pos); + if (r < 0) { + if (r == ONIG_MISMATCH) { + if (not) { + fprintf(stdout, "OK(N): %d\n", line_no); + nsucc++; + } + else { + fprintf(stdout, "FAIL: %d\n", line_no); + nfail++; + } + } + else { + if (error_no == 0) { + char buf[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )buf, r); + fprintf(stderr, "ERROR: %d: %s\n", line_no, buf); + nerror++; + } + else { + if (r == error_no) { + fprintf(stdout, "OK(ERROR): %d: %d\n", line_no, r); + nsucc++; + } + else { + fprintf(stdout, "FAIL ERROR NO: %d: %d, %d\n", line_no, error_no, r); + nfail++; + } + } + } + } + else { + if (not) { + fprintf(stdout, "FAIL(N): %d\n", line_no); + nfail++; + } + else { + OnigRegion* region; + + match_index = r; + region = onig_regset_get_region(set, match_index); + if (region == 0) { + fprintf(stderr, "ERROR: %d: can't get region.\n", line_no); + nerror++; + return ; + } + + if (region->beg[mem] == from && region->end[mem] == to) { + fprintf(stdout, "OK: %d\n", line_no); + nsucc++; + } + else { + char buf[1000]; + int len; + len = region->end[mem] - region->beg[mem]; + strncpy(buf, s + region->beg[mem], len); + buf[len] = '\0'; + fprintf(stdout, "FAIL: %d: %d-%d : %d-%d (%s)\n", line_no, + from, to, region->beg[mem], region->end[mem], buf); + nfail++; + } + } + } + + onig_regset_free(set); +} + +static void +x2(int line_no, int n, char* ps[], char* s, int from, int to) +{ + xx(line_no, n, ps, s, from, to, 0, 0, 0); +} + 
+static void +x3(int line_no, int n, char* ps[], char* s, int from, int to, int mem) +{ + xx(line_no, n, ps, s, from, to, mem, 0, 0); +} + +static void +n(int line_no, int n, char* ps[], char* s) +{ + xx(line_no, n, ps, s, 0, 0, 0, 1, 0); +} + +#define ASIZE(a) sizeof(a)/sizeof(a[0]) +#define X2(ps,s,from,to) x2(__LINE__,ASIZE(ps),ps,s,from,to) +#define X3(ps,s,from,to,mem) x3(__LINE__,ASIZE(ps),ps,s,from,to,mem) +#define N(ps,s) n(__LINE__,ASIZE(ps),ps,s) +#define NZERO(s) n(__LINE__,0,(char** )0,s) + +#ifndef _WIN32 + +/* getdelim() doesn't exist in Windows */ + +static int +get_all_content_of_file(char* path, char** rs, char** rend) +{ + size_t len; + size_t n; + char* line; + FILE* fp; + + fp = fopen(path, "r"); + if (fp == 0) return -1; + + n = 0; + line = NULL; + len = getdelim(&line, &n, EOF, fp); + fclose(fp); + if (len < 0) return -2; + + *rs = line; + *rend = line + len; + return 0; +} +#endif + + +#define TEXT_PATH "kofu-utf8.txt" + +/* --- To get kofu.txt --- + $ wget https://www.aozora.gr.jp/cards/000148/files/774_ruby_1640.zip + $ unzip 774_ruby_1640.zip + $ nkf -Lu -w8 kofu.txt > kofu-utf8.txt + (convert encoding to utf-8 with BOM and line terminator to be Unix-form) +*/ + +static char* p1[] = { + "abc", + "(bca)", + "(cab)" +}; + +static char* p2[] = { + "小説", + "9", + "夏目漱石", +}; + +static char* p3[] = { + "^いる。", + "^校正", + "^底本", + "^ 翌日", +}; + +static char* p4[] = { + "《[^》]{5}》", + "《[^》]{6}》", + "《[^》]{7}》", + "《[^》]{8}》", + "《[^》]{9}》", + "《[^》]{10}》", + "《[^》]{11}》", + "《[^》]{12}》", + "《[^》]{13}》", + "《[^》]{14}》", + "《[^》]{15}》", + "《[^》]{16}》", + "《[^》]{17}》", + "《[^》]{18}》", + "《[^》]{19}》", + "《[^》]{20}》", +}; + +static char* p5[] = { + "小室圭", + "bbbbbb", + "ドナルド・トランプ", + "筑摩書房", + "松原", + "aaaaaaaaa", + "bbbbbbbbb", + "ccccc", + "ddddddddddd", + "eee", + "ffffffffffff", + "gggggggggg", + "hhhhhhhhhhhhhh", + "iiiiiii", +}; + +static char* p6[] = { + "^.{1000,}", + "松原", + "小室圭", + "ドナルド・トランプ", + "筑摩書房", +}; + +static char* p7[] = { + 
"0+", "1+", "2+", "3+", "4+", "5+", "6+", "7+", "8+", "9+", +}; + +extern int +main(int argc, char* argv[]) +{ + int r; + int file_exist; + char *s, *end; + OnigEncoding use_encs[1]; + + use_encs[0] = ONIG_ENCODING_UTF8; + onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); + + srand(12345); + + XX_LEAD = ONIG_REGSET_POSITION_LEAD; + + NZERO(" abab bccab ca"); + X2(p1, " abab bccab ca", 8, 11); + X3(p1, " abab bccab ca", 8, 11, 1); + N(p2, " XXXX AAA 1223 012345678bbb"); + X2(p2, "0123456789", 9, 10); + X2(p7, "abcde 555 qwert", 6, 9); + + XX_LEAD = ONIG_REGSET_REGEX_LEAD; + + NZERO(" abab bccab ca"); + X2(p1, " abab bccab ca", 8, 11); + X3(p1, " abab bccab ca", 8, 11, 1); + N(p2, " XXXX AAA 1223 012345678bbb"); + X2(p2, "0123456789", 9, 10); + X2(p7, "abcde 555 qwert", 6, 9); + +#ifndef _WIN32 + r = get_all_content_of_file(TEXT_PATH, &s, &end); + if (r == 0) { + fprintf(stdout, "FILE: %s, size: %d\n", TEXT_PATH, (int )(end - s)); + file_exist = 1; + } + else { + fprintf(stdout, "Ignore %s\n", TEXT_PATH); + file_exist = 0; + } +#else + file_exist = 0; +#endif + + if (file_exist != 0) { + X2(p2, s, 10, 22); + X2(p3, s, 496079, 496088); + X2(p4, s, 1294, 1315); + } + + fprintf(stdout, + "\nRESULT SUCC: %4d, FAIL: %d, ERROR: %d (by Oniguruma %s)\n", + nsucc, nfail, nerror, onig_version()); + + if (file_exist != 0) { +#ifndef _WIN32 + fprintf(stdout, "\n"); + time_compare(ASIZE(p2), p2, s, end); + time_compare(ASIZE(p3), p3, s, end); + time_compare(ASIZE(p4), p4, s, end); + time_compare(ASIZE(p5), p5, s, end); + time_compare(ASIZE(p6), p6, s, end); + fprintf(stdout, "\n"); +#endif + free(s); + } + + onig_end(); + + return ((nfail == 0 && nerror == 0) ? 
0 : -1); +} diff --git a/test/test_utf8.c b/test/test_utf8.c index 2338526..d6fc761 100644 --- a/test/test_utf8.c +++ b/test/test_utf8.c @@ -132,8 +132,9 @@ static void e(char* pattern, char* str, int error_no) extern int main(int argc, char* argv[]) { - static OnigEncoding use_encs[] = { ONIG_ENCODING_UTF8 }; + OnigEncoding use_encs[1]; + use_encs[0] = ONIG_ENCODING_UTF8; onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); err_file = stdout; @@ -298,6 +299,8 @@ extern int main(int argc, char* argv[]) x2("(?i:xssy)", "xs\xc5\xbfy", 0, 5); x2("(?i:xssy)", "x\xc3\x9fy", 0, 4); x2("(?i:xssy)", "x\xe1\xba\x9ey", 0, 5); + x2("(?i:x\xc3\x9fy)", "xssy", 0, 4); + x2("(?i:x\xc3\x9fy)", "xSSy", 0, 4); x2("(?i:\xc3\x9f)", "ss", 0, 2); x2("(?i:\xc3\x9f)", "SS", 0, 2); x2("(?i:[\xc3\x9f])", "ss", 0, 2); @@ -1204,6 +1207,78 @@ extern int main(int argc, char* argv[]) x2("a{2,3}+a", "aaa", 0, 3); /* == (?:a{2,3})+*/ x2("[\\x{0}-\\x{7fffffff}]", "a", 0, 1); x2("[\\x{7f}-\\x{7fffffff}]", "\xe5\xae\xb6", 0, 3); + x2("[a[cdef]]", "a", 0, 1); + n("[a[xyz]-c]", "b"); + x2("[a[xyz]-c]", "a", 0, 1); + x2("[a[xyz]-c]", "-", 0, 1); + x2("[a[xyz]-c]", "c", 0, 1); + + x2("((?(a)\\g<1>|b))", "aab", 0, 3); + x2("((?(a)\\g<1>))", "aab", 0, 2); + x2("(b(?(a)|\\g<1>))", "bba", 0, 3); + e("(()(?(2)\\g<1>))", "", ONIGERR_NEVER_ENDING_RECURSION); + + x2("(?i)st", "st", 0, 2); + x2("(?i)st", "St", 0, 2); + x2("(?i)st", "sT", 0, 2); + x2("(?i)st", "\xC5\xBFt", 0, 3); // U+017F + x2("(?i)st", "\xEF\xAC\x85", 0, 3); // U+FB05 + x2("(?i)st", "\xEF\xAC\x86", 0, 3); // U+FB06 + x2("(?i)ast", "Ast", 0, 3); + x2("(?i)ast", "ASt", 0, 3); + x2("(?i)ast", "AsT", 0, 3); + x2("(?i)ast", "A\xC5\xBFt", 0, 4); // U+017F + x2("(?i)ast", "A\xEF\xAC\x85", 0, 4); // U+FB05 + x2("(?i)ast", "A\xEF\xAC\x86", 0, 4); // U+FB06 + x2("(?i)stZ", "stz", 0, 3); + x2("(?i)stZ", "Stz", 0, 3); + x2("(?i)stZ", "sTz", 0, 3); + x2("(?i)stZ", "\xC5\xBFtz", 0, 4); // U+017F + x2("(?i)stZ", "\xEF\xAC\x85z", 0, 4); // U+FB05 + 
x2("(?i)stZ", "\xEF\xAC\x86z", 0, 4); // U+FB06 + x2("(?i)BstZ", "bstz", 0, 4); + x2("(?i)BstZ", "bStz", 0, 4); + x2("(?i)BstZ", "bsTz", 0, 4); + x2("(?i)BstZ", "b\xC5\xBFtz", 0, 5); // U+017F + x2("(?i)BstZ", "b\xEF\xAC\x85z", 0, 5); // U+FB05 + x2("(?i)BstZ", "b\xEF\xAC\x86z", 0, 5); // U+FB06 + x2("(?i).*st\\z", "tttssss\xC5\xBFt", 0, 10); // U+017F + x2("(?i).*st\\z", "tttssss\xEF\xAC\x85", 0, 10); // U+FB05 + x2("(?i).*st\\z", "tttssss\xEF\xAC\x86", 0, 10); // U+FB06 + x2("(?i).*あstい\\z", "tttssssあ\xC5\xBFtい", 0, 16); // U+017F + x2("(?i).*あstい\\z", "tttssssあ\xEF\xAC\x85い", 0, 16); // U+FB05 + x2("(?i).*あstい\\z", "tttssssあ\xEF\xAC\x86い", 0, 16); // U+FB06 + x2("(?i).*\xC5\xBFt\\z", "tttssssst", 0, 9); // U+017F + x2("(?i).*\xEF\xAC\x85\\z", "tttssssあst", 0, 12); // U+FB05 + x2("(?i).*\xEF\xAC\x86い\\z", "tttssssstい", 0, 12); // U+FB06 + x2("(?i).*\xEF\xAC\x85\\z", "tttssssあ\xEF\xAC\x85", 0, 13); + + x2("(?i).*ss", "abcdefghijklmnopqrstuvwxyz\xc3\x9f", 0, 28); // U+00DF + x2("(?i).*ss.*", "abcdefghijklmnopqrstuvwxyz\xc3\x9fxyz", 0, 31); // U+00DF + x2("(?i).*\xc3\x9f", "abcdefghijklmnopqrstuvwxyzss", 0, 28); // U+00DF + x2("(?i).*ss.*", "abcdefghijklmnopqrstuvwxyzSSxyz", 0, 31); + + x2("(?i)ssv", "\xc3\x9fv", 0, 3); // U+00DF + x2("(?i)(?<=ss)v", "SSv", 2, 3); + x2("(?i)(?<=\xc3\x9f)v", "\xc3\x9fv", 2, 3); + //x2("(?i)(?<=\xc3\x9f)v", "ssv", 2, 3); + //x2("(?i)(?<=ss)v", "\xc3\x9fv", 2, 3); + + /* #156 U+01F0 (UTF-8: C7 B0) */ + x2("(?i).+Isssǰ", ".+Isssǰ", 0, 8); + x2(".+Isssǰ", ".+Isssǰ", 0, 8); + x2("(?i)ǰ", "ǰ", 0, 2); + x2("(?i)ǰ", "j\xcc\x8c", 0, 3); + x2("(?i)j\xcc\x8c", "ǰ", 0, 2); + x2("(?i)5ǰ", "5ǰ", 0, 3); + x2("(?i)5ǰ", "5j\xcc\x8c", 0, 4); + x2("(?i)5j\xcc\x8c", "5ǰ", 0, 3); + x2("(?i)ǰv", "ǰV", 0, 3); + x2("(?i)ǰv", "j\xcc\x8cV", 0, 4); + x2("(?i)j\xcc\x8cv", "ǰV", 0, 3); + x2("(?i)[ǰ]", "ǰ", 0, 2); + x2("(?i)[ǰ]", "j\xcc\x8c", 0, 3); + //x2("(?i)[j]\xcc\x8c", "ǰ", 0, 2); n(" \xfd", ""); /* https://bugs.php.net/bug.php?id=77370 */ /* can't use 
\xfc00.. because compiler error: hex escape sequence out of range */ @@ -1212,7 +1287,10 @@ extern int main(int argc, char* argv[]) e("(?i)000000000000000000000\xf0", "", ONIGERR_INVALID_CODE_POINT_VALUE); /* https://bugs.php.net/bug.php?id=77382 */ n("0000\\\xf5", "0"); /* https://bugs.php.net/bug.php?id=77385 */ n("(?i)FFF00000000000000000\xfd", ""); /* https://bugs.php.net/bug.php?id=77394 */ - + e("x{55380}{77590}", "", ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); + e("(xyz){40000}{99999}(?vv)", "", ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); + e("f{90000,90000}{80000,80000}", "", ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); + n("f{90000,90000}{80000,80001}", ""); x2("\\p{Common}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */ x2("\\p{In_Enclosed_CJK_Letters_and_Months}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */ diff --git a/test/testc.c b/test/testc.c index c3174cd..5c60764 100644 --- a/test/testc.c +++ b/test/testc.c @@ -153,8 +153,9 @@ static void n(char* pattern, char* str) extern int main(int argc, char* argv[]) { #ifndef POSIX_TEST - static OnigEncoding use_encs[] = { ONIG_ENCODING_EUC_JP }; + OnigEncoding use_encs[1]; + use_encs[0] = ONIG_ENCODING_EUC_JP; onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); #endif diff --git a/test/testu.c b/test/testu.c index 397da95..24397ab 100644 --- a/test/testu.c +++ b/test/testu.c @@ -190,8 +190,9 @@ static void n(char* pattern, char* str) extern int main(int argc, char* argv[]) { - static OnigEncoding use_encs[] = { ONIG_ENCODING_UTF16_BE }; + OnigEncoding use_encs[1]; + use_encs[0] = ONIG_ENCODING_UTF16_BE; onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); err_file = stdout; -- cgit v1.2.3 From cd957790e3a1c549cf86a8991f03a39d975fe3bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 29 Nov 2019 11:46:56 +0100 Subject: Refresh symbols file --- debian/changelog | 4 ++-- debian/symbols | 15 ++++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git 
a/debian/changelog b/debian/changelog index d88a462..83049ff 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -libonig (6.9.3-1) UNRELEASED; urgency=medium +libonig (6.9.4-1) UNRELEASED; urgency=medium * Neu upstream release. - Refresh symbols file and add Build-Depends-Package field. @@ -8,7 +8,7 @@ libonig (6.9.3-1) UNRELEASED; urgency=medium - Refresh debain/copyright. * debian/watch:_Correct typo. - -- Jörg Frings-Fürst Wed, 07 Aug 2019 09:33:40 +0200 + -- Jörg Frings-Fürst Fri, 29 Nov 2019 11:27:09 +0100 libonig (6.9.2-1) unstable; urgency=medium diff --git a/debian/symbols b/debian/symbols index 89468b0..19e8a59 100644 --- a/debian/symbols +++ b/debian/symbols @@ -143,12 +143,11 @@ libonig.so.5 libonig5 #MINVER# onig_name_to_group_numbers@Base 6.8.1 onig_names_free@Base 6.8.1 onig_new@Base 6.8.1 + onig_new_cclass_with_code_list@Base 6.9.4 onig_new_deluxe@Base 6.8.1 onig_new_match_param@Base 6.8.1 onig_new_without_alloc@Base 6.8.1 - onig_node_conv_to_str_node@Base 6.8.1 onig_node_free@Base 6.8.1 - onig_node_list_add@Base 6.8.1 onig_node_new_alt@Base 6.8.1 onig_node_new_anchor@Base 6.8.1 onig_node_new_bag@Base 6.9.1 @@ -174,10 +173,18 @@ libonig.so.5 libonig5 #MINVER# onig_region_new@Base 6.8.1 onig_region_resize@Base 6.8.1 onig_region_set@Base 6.8.1 + onig_regset_add@Base 6.9.4 + onig_regset_free@Base 6.9.4 + onig_regset_get_regex@Base 6.9.4 + onig_regset_get_region@Base 6.9.4 + onig_regset_new@Base 6.9.4 + onig_regset_number_of_regex@Base 6.9.4 + onig_regset_replace@Base 6.9.4 + onig_regset_search@Base 6.9.4 + onig_regset_search_with_param@Base 6.9.4 onig_renumber_name_table@Base 6.8.1 onig_scan@Base 6.8.1 onig_scan_env_set_error_string@Base 6.8.1 - onig_scan_unsigned_number@Base 6.8.1 onig_search@Base 6.8.1 onig_search_with_param@Base 6.8.1 onig_set_callout_data@Base 6.8.1 @@ -254,10 +261,8 @@ libonig.so.5 libonig5 #MINVER# onigenc_is_valid_mbc_string@Base 6.8.1 onigenc_length_check_is_valid_mbc_string@Base 6.8.1 
onigenc_mb2_code_to_mbc@Base 6.8.1 - onigenc_mb2_code_to_mbclen@Base 6.8.1 onigenc_mb2_is_code_ctype@Base 6.8.1 onigenc_mb4_code_to_mbc@Base 6.8.1 - onigenc_mb4_code_to_mbclen@Base 6.8.1 onigenc_mb4_is_code_ctype@Base 6.8.1 onigenc_mbn_mbc_case_fold@Base 6.8.1 onigenc_mbn_mbc_to_code@Base 6.8.1 -- cgit v1.2.3 From 6242ce2fd8f4e293f6acbd828d268a917a1fd10c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 29 Nov 2019 11:50:31 +0100 Subject: Declare compliance with Debian Policy 4.4.1.1 --- debian/changelog | 1 + debian/control | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/debian/changelog b/debian/changelog index 83049ff..c654cd3 100644 --- a/debian/changelog +++ b/debian/changelog @@ -7,6 +7,7 @@ libonig (6.9.4-1) UNRELEASED; urgency=medium + 0110-CVE-2019-13225.patch - Refresh debain/copyright. * debian/watch:_Correct typo. + * Declare compliance with Debian Policy 4.4.1.1 (No changes needed). -- Jörg Frings-Fürst Fri, 29 Nov 2019 11:27:09 +0100 diff --git a/debian/control b/debian/control index bad90d6..537bc40 100644 --- a/debian/control +++ b/debian/control @@ -4,7 +4,7 @@ Priority: extra Maintainer: Jörg Frings-Fürst Build-Depends: debhelper (>= 12) -Standards-Version: 4.4.0 +Standards-Version: 4.4.1 Homepage: https://github.com/kkos/oniguruma Vcs-Git: git://jff.email/opt/git/libonig.git Vcs-Browser: https://jff.email/cgit/libonig.git -- cgit v1.2.3 From 63bdac4134d7d74b5dcb65aef0d22c32cb7079a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 29 Nov 2019 12:05:42 +0100 Subject: Switch to debhelper-compat --- debian/changelog | 3 +++ debian/compat | 1 - debian/control | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) delete mode 100644 debian/compat diff --git a/debian/changelog b/debian/changelog index c654cd3..0fc7fa0 100644 --- a/debian/changelog +++ b/debian/changelog @@ -8,6 +8,9 @@ libonig (6.9.4-1) UNRELEASED; urgency=medium - Refresh debain/copyright. 
* debian/watch:_Correct typo. * Declare compliance with Debian Policy 4.4.1.1 (No changes needed). + * Switch to debhelper-compat: + - debian/control: change to debhelper-compat (=12) + - remove debian/compat -- Jörg Frings-Fürst Fri, 29 Nov 2019 11:27:09 +0100 diff --git a/debian/compat b/debian/compat deleted file mode 100644 index 48082f7..0000000 --- a/debian/compat +++ /dev/null @@ -1 +0,0 @@ -12 diff --git a/debian/control b/debian/control index 537bc40..d48d070 100644 --- a/debian/control +++ b/debian/control @@ -3,7 +3,7 @@ Section: libs Priority: extra Maintainer: Jörg Frings-Fürst Build-Depends: - debhelper (>= 12) + debhelper-compat (= 12) Standards-Version: 4.4.1 Homepage: https://github.com/kkos/oniguruma Vcs-Git: git://jff.email/opt/git/libonig.git -- cgit v1.2.3 From d71526b1c053fc69aaec2d2465c6d416633662fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 29 Nov 2019 12:18:09 +0100 Subject: d/contol: Add Rules-Requires-Root: binary-targets --- debian/changelog | 2 ++ debian/control | 1 + 2 files changed, 3 insertions(+) diff --git a/debian/changelog b/debian/changelog index 0fc7fa0..8376088 100644 --- a/debian/changelog +++ b/debian/changelog @@ -11,6 +11,8 @@ libonig (6.9.4-1) UNRELEASED; urgency=medium * Switch to debhelper-compat: - debian/control: change to debhelper-compat (=12) - remove debian/compat + * debian/control: + - Add Rules-Requires-Root: binary-targets. 
-- Jörg Frings-Fürst Fri, 29 Nov 2019 11:27:09 +0100 diff --git a/debian/control b/debian/control index d48d070..b62f093 100644 --- a/debian/control +++ b/debian/control @@ -5,6 +5,7 @@ Maintainer: Jörg Frings-Fürst Build-Depends: debhelper-compat (= 12) Standards-Version: 4.4.1 +Rules-Requires-Root: binary-targets Homepage: https://github.com/kkos/oniguruma Vcs-Git: git://jff.email/opt/git/libonig.git Vcs-Browser: https://jff.email/cgit/libonig.git -- cgit v1.2.3 From 101f52fb0d2c7213ea63c29b031489fb22c2b8a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 29 Nov 2019 12:34:20 +0100 Subject: d/changelog: Add some fixed CVEs --- debian/changelog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/debian/changelog b/debian/changelog index 8376088..5106128 100644 --- a/debian/changelog +++ b/debian/changelog @@ -6,6 +6,14 @@ libonig (6.9.4-1) UNRELEASED; urgency=medium + 0105-CVE-2019-13224.patch + 0110-CVE-2019-13225.patch - Refresh debain/copyright. + - Fixes CVE-2019-19204: heap-buffer-overflow in fetch_interval_quantifier + due to double PFETCH (Closes: #945313). + - Fixes CVE-2019-19203: heap-buffer-overflow in gb18030_mbc_enc_len + (Closes: #945312). + - Fixes CVE-2019-19012: Out of bounds read in mbc_to_code() + (Closes: #944959). + - Fixes CVE-2019-16163: Stack Exhaustion Problem (Closes: #939988). + - Fixes CVE-2019-19246: heap-based buffer over-read in str_lower_case_match. * debian/watch:_Correct typo. * Declare compliance with Debian Policy 4.4.1.1 (No changes needed). 
* Switch to debhelper-compat: -- cgit v1.2.3 From c347fb41cbf06a1b2640758151c9d132d70fd298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 29 Nov 2019 12:46:35 +0100 Subject: d/changelog: Change distribution to unstable, Change date and time --- debian/changelog | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debian/changelog b/debian/changelog index 5106128..63c1391 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -libonig (6.9.4-1) UNRELEASED; urgency=medium +libonig (6.9.4-1) unstable; urgency=medium * Neu upstream release. - Refresh symbols file and add Build-Depends-Package field. @@ -22,7 +22,7 @@ libonig (6.9.4-1) UNRELEASED; urgency=medium * debian/control: - Add Rules-Requires-Root: binary-targets. - -- Jörg Frings-Fürst Fri, 29 Nov 2019 11:27:09 +0100 + -- Jörg Frings-Fürst Fri, 29 Nov 2019 12:45:36 +0100 libonig (6.9.2-1) unstable; urgency=medium -- cgit v1.2.3 From f0f2976752d54aa632bd5b1e0d225b95f0413734 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 22 Dec 2019 15:58:37 +0100 Subject: Change Rules-Requires-Root to no --- debian/changelog | 2 +- debian/control | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/debian/changelog b/debian/changelog index 63c1391..a5cadda 100644 --- a/debian/changelog +++ b/debian/changelog @@ -20,7 +20,7 @@ libonig (6.9.4-1) unstable; urgency=medium - debian/control: change to debhelper-compat (=12) - remove debian/compat * debian/control: - - Add Rules-Requires-Root: binary-targets. + - Add Rules-Requires-Root: no. 
-- Jörg Frings-Fürst Fri, 29 Nov 2019 12:45:36 +0100 diff --git a/debian/control b/debian/control index b62f093..a277d0f 100644 --- a/debian/control +++ b/debian/control @@ -4,8 +4,8 @@ Priority: extra Maintainer: Jörg Frings-Fürst Build-Depends: debhelper-compat (= 12) -Standards-Version: 4.4.1 -Rules-Requires-Root: binary-targets +Standards-Version: 4.4.1.1 +Rules-Requires-Root: no Homepage: https://github.com/kkos/oniguruma Vcs-Git: git://jff.email/opt/git/libonig.git Vcs-Browser: https://jff.email/cgit/libonig.git -- cgit v1.2.3 From 09ad01174dfbfe683e0ed840d6469108d1b13d64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 22 Dec 2019 16:00:30 +0100 Subject: Remove outdated debian/NEWS.Debian --- debian/NEWS.Debian | 6 ------ debian/changelog | 1 + 2 files changed, 1 insertion(+), 6 deletions(-) delete mode 100644 debian/NEWS.Debian diff --git a/debian/NEWS.Debian b/debian/NEWS.Debian deleted file mode 100644 index e58552a..0000000 --- a/debian/NEWS.Debian +++ /dev/null @@ -1,6 +0,0 @@ -libonig (6.0.0-1) unstable; urgency=medium - - The file /usr/bin/onig-config is not Multi-Arch conform. So it is removed - in this release. You can use pkg-config instead. - - -- Jörg Frings-Fürst Tue, 06 Jan 2015 11:09:12 +0100 diff --git a/debian/changelog b/debian/changelog index a5cadda..6a8ff81 100644 --- a/debian/changelog +++ b/debian/changelog @@ -21,6 +21,7 @@ libonig (6.9.4-1) unstable; urgency=medium - remove debian/compat * debian/control: - Add Rules-Requires-Root: no. + * Remove outdated debian/NEWS.Debian. 
-- Jörg Frings-Fürst Fri, 29 Nov 2019 12:45:36 +0100 -- cgit v1.2.3 From 091456e1a135d4674701a264495bd34918779391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 22 Dec 2019 16:01:54 +0100 Subject: d/changelog: Change date and time --- debian/changelog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/changelog b/debian/changelog index 6a8ff81..8dada6e 100644 --- a/debian/changelog +++ b/debian/changelog @@ -23,7 +23,7 @@ libonig (6.9.4-1) unstable; urgency=medium - Add Rules-Requires-Root: no. * Remove outdated debian/NEWS.Debian. - -- Jörg Frings-Fürst Fri, 29 Nov 2019 12:45:36 +0100 + -- Jörg Frings-Fürst Sun, 22 Dec 2019 16:00:46 +0100 libonig (6.9.2-1) unstable; urgency=medium -- cgit v1.2.3