diff options
author | Jörg Frings-Fürst <debian@jff.email> | 2021-04-26 17:40:21 +0200 |
---|---|---|
committer | Jörg Frings-Fürst <debian@jff.email> | 2021-04-26 17:40:21 +0200 |
commit | d3a83c35311ec631a46b59b66c38ef8d3a2a629a (patch) | |
tree | 28fc4dd524fa29f712020b61e565ab47b1fefd8e | |
parent | 77a04959299aa252579a98655e626d1b8f5f9f34 (diff) | |
parent | 98f7065a3f7b386564840bb5b24b94f9335b2e97 (diff) |
Update upstream source from tag 'upstream/6.9.7.1'
Update to upstream version '6.9.7.1'
with Debian dir c2c92e088b7e91033d7f5bee51ac7827148eaf4b
58 files changed, 4552 insertions, 2633 deletions
@@ -44,6 +44,7 @@ m4/*.m4 # test/ /test/test_utf8 +/test/test_options /test/testc /test/testcu /test/testp @@ -68,6 +69,7 @@ m4/*.m4 /sample/bug_fix /sample/regset /sample/scan +/sample/callback_each_match /sample/log* /harnesses/utf16*.dict diff --git a/CMakeLists.txt b/CMakeLists.txt index 06af497..1944037 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.1) project(oniguruma - VERSION 6.9.6 + VERSION 6.9.7 LANGUAGES C) set(PACKAGE onig) @@ -100,6 +100,12 @@ if(MSVC) $<$<CONFIG:RelWithDebgInfo>:/MTd> ) endif() + if(MSVC_VERSION LESS_EQUAL "1800") + # <= VS2013 + target_compile_definitions(onig PRIVATE + -Dinline=__inline + ) + endif() elseif(CMAKE_COMPILER_IS_GNUCC) target_compile_options(onig PRIVATE -Wall @@ -1,7 +1,7 @@ Oniguruma LICENSE ----------------- -Copyright (c) 2002-2020 K.Kosako <kkosako0@gmail.com> +Copyright (c) 2002-2021 K.Kosako <kkosako0@gmail.com> All rights reserved. Redistribution and use in source and binary forms, with or without @@ -1,5 +1,25 @@ History +2021/04/15: Version 6.9.7 revised 1 + +2021/04/14: fix: replace UChar to OnigUChar in oniguruma.h + +2021/04/14: Version 6.9.7 +2021/03/31: Release Candidate 1 for Version 6.9.7 + +2021/03/23: fix Issue 32340, 32345, 32355 in oss-fuzz +2021/03/12: fix invalid optimization info for if-pattern (?(cond)...) 
+2021/02/21: NEW API: ONIG_OPTION_CALLBACK_EACH_MATCH +2021/02/02: fix Issue 30144 in oss-fuzz: Timeout +2021/01/18: NEW API: ONIG_SYNTAX_PYTHON +2020/12/20: fix Issue 28795 in oss-fuzz: Timeout +2020/12/13: fix Issue 28554 in oss-fuzz: Timeout, check very inefficient patterns at tune_tree(NODE_CALL) +2020/12/04: fix Issue 28259 in oss-fuzz: Timeout +2020/12/03: fix invalid reduction of nested quantifiers (?:<expr>+?)* and (?:<expr>+?)+ +2020/12/01: fix Issue 28104 in oss-fuzz: Timeout +2020/11/28: NEW API: ONIG_OPTION_IGNORECASE_IS_ASCII +2020/11/07: fix Issue 27015 in oss-fuzz: Timeout + 2020/11/05: Version 6.9.6 2020/11/01: fix Issue 26798 in oss-fuzz: Timeout @@ -2432,4 +2452,13 @@ cvs rtag "VERSION_X_X_X" oniguruma age: number of supported previous interfaces (if current only supported then age == 0) + +<add SHA256 checksum> +MacOS X +$ shasum -a 256 -b onig-X.Y.Z.tar.gz > onig-X.Y.Z.tar.gz.sha256 + +<check SHA256 checksum> +MacOS X +$ shasum -a 256 -c onig-X.Y.Z.tar.gz.sha256 + //END @@ -29,10 +29,21 @@ Supported character encodings: * doc/SYNTAX.md: contributed by seanofw -Version 6.9.6 +Notice (from 6.9.6) +------------------- +When using configure script, if you have the POSIX API enabled in an earlier version (disabled by default in 6.9.5) and you need application binary compatibility with the POSIX API, specify "--enable-binary-compatible-posix-api=yes" instead of "--enable-posix-api=yes". Starting in 6.9.6, "--enable-posix-api=yes" only supports source-level compatibility for 6.9.5 and earlier about POSIX API. (Issue #210) + + +Version 6.9.7 ------------- -* When using configure script, if you have the POSIX API enabled in an earlier version (disabled by default in 6.9.5) and you need application binary compatibility with the POSIX API, specify "--enable-binary-compatible-posix-api=yes" instead of "--enable-posix-api=yes". Starting in 6.9.6, "--enable-posix-api=yes" only supports source-level compatibility for 6.9.5 and earlier about POSIX API. 
(Issue #210) +* NEW API: ONIG_OPTION_CALLBACK_EACH_MATCH +* NEW API: ONIG_OPTION_IGNORECASE_IS_ASCII +* NEW API: ONIG_SYNTAX_PYTHON +* Fixed some problems found by OSS-Fuzz + +Version 6.9.6 +------------- * NEW: configure option --enable-binary-compatible-posix-api=[yes/no] * NEW API: Limiting the maximum number of calls of subexp-call * NEW API: ONIG_OPTION_NOT_BEGIN_STRING / NOT_END_STRING / NOT_BEGIN_POSITION @@ -102,69 +113,6 @@ Version 6.9.0 * NEW: add Emoji properties -Version 6.8.2 -------------- - -* Fix: #80 UChar in header causes issue -* NEW API: onig_set_callout_user_data_of_match_param() (* omission in 6.8.0) -* add doc/CALLOUTS.API and doc/CALLOUTS.API.ja - - -Version 6.8.1 -------------- - -* Update shared library version to 5.0.0 for API incompatible changes from 6.7.1 - - -Version 6.8.0 -------------- - -* Retry-limit-in-match function enabled by default -* NEW: configure option --enable-posix-api=no (* enabled by default) -* NEW API: onig_search_with_param(), onig_match_with_param() -* NEW: Callouts of contents (?{...contents...}) (?{...}\[tag]\[X<>]) (?{{...}}) -* NEW: Callouts of name (*name) (*name\[tag]{args...}) -* NEW: Builtin callouts (*FAIL) (*MISMATCH) (*ERROR{n}) (*COUNT) (*MAX{n}) etc.. -* Examples of Callouts program: [callout.c](sample/callout.c), [count.c](sample/count.c), [echo.c](sample/echo.c) - - -Version 6.7.1 -------------- - -* NEW: Mechanism of retry-limit-in-match (* disabled by default) - - -Version 6.7.0 -------------- - -* NEW: hexadecimal codepoint \uHHHH -* NEW: add ONIG_SYNTAX_ONIGURUMA (== ONIG_SYNTAX_DEFAULT) -* Disabled \N and \O on ONIG_SYNTAX_RUBY -* Reduced size of object file - - -Version 6.6.0 -------------- - -* NEW: ASCII only mode options for character type/property (?WDSP) -* NEW: Extended Grapheme Cluster boundary \y, \Y -* NEW: Extended Grapheme Cluster \X -* Range-clear (Absent-clear) operator restores previous range in retractions. 
- - -Version 6.5.0 -------------- - -* NEW: \K (keep) -* NEW: \R (general newline) \N (no newline) -* NEW: \O (true anychar) -* NEW: if-then-else (?(...)...\|...) -* NEW: Backreference validity checker (?(xxx)) (*original) -* NEW: Absent repeater (?~absent) \[is equal to (?\~\|(?:absent)|\O*)] -* NEW: Absent expression (?~|absent|expr) (*original) -* NEW: Absent stopper (?~|absent) (*original) - - License ------- @@ -205,18 +153,17 @@ Install ### Case 3: Windows 64/32bit platform (Visual Studio) - Execute make_win.bat + * build library + + .\make_win.bat onig_s.lib: static link library onig.dll: dynamic link library - * test (ASCII/Shift_JIS) + * make test programs - 1. cd src - 2. copy ..\windows\testc.c . - 3. nmake -f Makefile.windows ctest + .\make_win.bat all-test - (I have checked by Visual Studio Community 2015) Alternatively, you can build and install oniguruma using [vcpkg](https://github.com/microsoft/vcpkg/) dependency manager: @@ -3,7 +3,7 @@ scriptversion=2018-03-07.03; # UTC -# Copyright (C) 1999-2018 Free Software Foundation, Inc. +# Copyright (C) 1999-2020 Free Software Foundation, Inc. # Written by Tom Tromey <tromey@cygnus.com>. # # This program is free software; you can redistribute it and/or modify @@ -53,7 +53,7 @@ func_file_conv () MINGW*) file_conv=mingw ;; - CYGWIN*) + CYGWIN* | MSYS*) file_conv=cygwin ;; *) @@ -67,7 +67,7 @@ func_file_conv () mingw/*) file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` ;; - cygwin/*) + cygwin/* | msys/*) file=`cygpath -m "$file" || echo "$file"` ;; wine/*) diff --git a/config.guess b/config.guess index 256083a..9aff91c 100755 --- a/config.guess +++ b/config.guess @@ -1,8 +1,8 @@ #! /bin/sh # Attempt to guess a canonical system name. -# Copyright 1992-2018 Free Software Foundation, Inc. +# Copyright 1992-2020 Free Software Foundation, Inc. 
-timestamp='2018-03-08' +timestamp='2020-08-17' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -50,7 +50,7 @@ version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright 1992-2018 Free Software Foundation, Inc. +Copyright 1992-2020 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -84,8 +84,6 @@ if test $# != 0; then exit 1 fi -trap 'exit 1' 1 2 15 - # CC_FOR_BUILD -- compiler used by this script. Note that the use of a # compiler to aid in system detection is discouraged as it requires # temporary files to be created and, as you can see below, it is a @@ -96,34 +94,40 @@ trap 'exit 1' 1 2 15 # Portable tmp directory creation inspired by the Autoconf team. -set_cc_for_build=' -trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; -trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; -: ${TMPDIR=/tmp} ; - { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || - { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || - { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || - { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; -dummy=$tmp/dummy ; -tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; -case $CC_FOR_BUILD,$HOST_CC,$CC in - ,,) echo "int x;" > "$dummy.c" ; - for c in cc gcc c89 c99 ; do - if ($c -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then - CC_FOR_BUILD="$c"; break ; - fi ; - done ; - if test x"$CC_FOR_BUILD" = x ; then - CC_FOR_BUILD=no_compiler_found ; - fi - ;; - ,,*) CC_FOR_BUILD=$CC ;; - ,*,*) CC_FOR_BUILD=$HOST_CC ;; -esac ; set_cc_for_build= ;' +tmp= +# 
shellcheck disable=SC2172 +trap 'test -z "$tmp" || rm -fr "$tmp"' 0 1 2 13 15 + +set_cc_for_build() { + # prevent multiple calls if $tmp is already set + test "$tmp" && return 0 + : "${TMPDIR=/tmp}" + # shellcheck disable=SC2039 + { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir "$tmp" 2>/dev/null) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir "$tmp" 2>/dev/null) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } + dummy=$tmp/dummy + case ${CC_FOR_BUILD-},${HOST_CC-},${CC-} in + ,,) echo "int x;" > "$dummy.c" + for driver in cc gcc c89 c99 ; do + if ($driver -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then + CC_FOR_BUILD="$driver" + break + fi + done + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; + esac +} # This is needed to find uname on a Pyramid OSx when run in the BSD universe. # (ghazi@noc.rutgers.edu 1994-08-24) -if (test -f /.attbin/uname) >/dev/null 2>&1 ; then +if test -f /.attbin/uname ; then PATH=$PATH:/.attbin ; export PATH fi @@ -138,7 +142,7 @@ Linux|GNU|GNU/*) # We could probably try harder. LIBC=gnu - eval "$set_cc_for_build" + set_cc_for_build cat <<-EOF > "$dummy.c" #include <features.h> #if defined(__UCLIBC__) @@ -199,7 +203,7 @@ case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in os=netbsdelf ;; arm*|i386|m68k|ns32k|sh3*|sparc|vax) - eval "$set_cc_for_build" + set_cc_for_build if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ELF__ then @@ -237,7 +241,7 @@ case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: # contains redundant information, the shorter form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. 
- echo "$machine-${os}${release}${abi}" + echo "$machine-${os}${release}${abi-}" exit ;; *:Bitrig:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` @@ -260,6 +264,9 @@ case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in *:SolidBSD:*:*) echo "$UNAME_MACHINE"-unknown-solidbsd"$UNAME_RELEASE" exit ;; + *:OS108:*:*) + echo "$UNAME_MACHINE"-unknown-os108_"$UNAME_RELEASE" + exit ;; macppc:MirBSD:*:*) echo powerpc-unknown-mirbsd"$UNAME_RELEASE" exit ;; @@ -269,12 +276,15 @@ case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in *:Sortix:*:*) echo "$UNAME_MACHINE"-unknown-sortix exit ;; + *:Twizzler:*:*) + echo "$UNAME_MACHINE"-unknown-twizzler + exit ;; *:Redox:*:*) echo "$UNAME_MACHINE"-unknown-redox exit ;; mips:OSF1:*.*) - echo mips-dec-osf1 - exit ;; + echo mips-dec-osf1 + exit ;; alpha:OSF1:*:*) case $UNAME_RELEASE in *4.0) @@ -389,12 +399,12 @@ case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in echo i386-pc-auroraux"$UNAME_RELEASE" exit ;; i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) - eval "$set_cc_for_build" + set_cc_for_build SUN_ARCH=i386 # If there is a compiler, see if it is configured for 64-bit objects. # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. # This test works for both compilers. 
- if [ "$CC_FOR_BUILD" != no_compiler_found ]; then + if test "$CC_FOR_BUILD" != no_compiler_found; then if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null @@ -482,7 +492,7 @@ case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in echo clipper-intergraph-clix"$UNAME_RELEASE" exit ;; mips:*:*:UMIPS | mips:*:*:RISCos) - eval "$set_cc_for_build" + set_cc_for_build sed 's/^ //' << EOF > "$dummy.c" #ifdef __cplusplus #include <stdio.h> /* for printf() prototype */ @@ -534,10 +544,10 @@ EOF AViiON:dgux:*:*) # DG/UX returns AViiON for all architectures UNAME_PROCESSOR=`/usr/bin/uname -p` - if [ "$UNAME_PROCESSOR" = mc88100 ] || [ "$UNAME_PROCESSOR" = mc88110 ] + if test "$UNAME_PROCESSOR" = mc88100 || test "$UNAME_PROCESSOR" = mc88110 then - if [ "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx ] || \ - [ "$TARGET_BINARY_INTERFACE"x = x ] + if test "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx || \ + test "$TARGET_BINARY_INTERFACE"x = x then echo m88k-dg-dgux"$UNAME_RELEASE" else @@ -570,7 +580,7 @@ EOF echo i386-ibm-aix exit ;; ia64:AIX:*:*) - if [ -x /usr/bin/oslevel ] ; then + if test -x /usr/bin/oslevel ; then IBM_REV=`/usr/bin/oslevel` else IBM_REV="$UNAME_VERSION.$UNAME_RELEASE" @@ -579,7 +589,7 @@ EOF exit ;; *:AIX:2:3) if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then - eval "$set_cc_for_build" + set_cc_for_build sed 's/^ //' << EOF > "$dummy.c" #include <sys/systemcfg.h> @@ -610,7 +620,7 @@ EOF else IBM_ARCH=powerpc fi - if [ -x /usr/bin/lslpp ] ; then + if test -x /usr/bin/lslpp ; then IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | awk -F: '{ print $3 }' | sed s/[0-9]*$/0/` else @@ -645,7 +655,7 @@ EOF 9000/31?) HP_ARCH=m68000 ;; 9000/[34]??) 
HP_ARCH=m68k ;; 9000/[678][0-9][0-9]) - if [ -x /usr/bin/getconf ]; then + if test -x /usr/bin/getconf; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` case "$sc_cpu_version" in @@ -659,8 +669,8 @@ EOF esac ;; esac fi - if [ "$HP_ARCH" = "" ]; then - eval "$set_cc_for_build" + if test "$HP_ARCH" = ""; then + set_cc_for_build sed 's/^ //' << EOF > "$dummy.c" #define _HPUX_SOURCE @@ -698,9 +708,9 @@ EOF test -z "$HP_ARCH" && HP_ARCH=hppa fi ;; esac - if [ "$HP_ARCH" = hppa2.0w ] + if test "$HP_ARCH" = hppa2.0w then - eval "$set_cc_for_build" + set_cc_for_build # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler @@ -726,7 +736,7 @@ EOF echo ia64-hp-hpux"$HPUX_REV" exit ;; 3050*:HI-UX:*:*) - eval "$set_cc_for_build" + set_cc_for_build sed 's/^ //' << EOF > "$dummy.c" #include <unistd.h> int @@ -772,7 +782,7 @@ EOF echo hppa1.0-hp-osf exit ;; i*86:OSF1:*:*) - if [ -x /usr/sbin/sysversion ] ; then + if test -x /usr/sbin/sysversion ; then echo "$UNAME_MACHINE"-unknown-osf1mk else echo "$UNAME_MACHINE"-unknown-osf1 @@ -840,6 +850,17 @@ EOF *:BSD/OS:*:*) echo "$UNAME_MACHINE"-unknown-bsdi"$UNAME_RELEASE" exit ;; + arm:FreeBSD:*:*) + UNAME_PROCESSOR=`uname -p` + set_cc_for_build + if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_PCS_VFP + then + echo "${UNAME_PROCESSOR}"-unknown-freebsd"`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`"-gnueabi + else + echo "${UNAME_PROCESSOR}"-unknown-freebsd"`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`"-gnueabihf + fi + exit ;; *:FreeBSD:*:*) UNAME_PROCESSOR=`/usr/bin/uname -p` case "$UNAME_PROCESSOR" in @@ -881,7 +902,7 @@ EOF echo "$UNAME_MACHINE"-pc-uwin exit ;; amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) - echo x86_64-unknown-cygwin + echo x86_64-pc-cygwin exit ;; prep*:SunOS:5.*:*) echo powerpcle-unknown-solaris2"`echo "$UNAME_RELEASE"|sed -e 's/[^.]*//'`" @@ 
-894,8 +915,8 @@ EOF # other systems with GNU libc and userland echo "$UNAME_MACHINE-unknown-`echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo "$UNAME_RELEASE"|sed -e 's/[-(].*//'`-$LIBC" exit ;; - i*86:Minix:*:*) - echo "$UNAME_MACHINE"-pc-minix + *:Minix:*:*) + echo "$UNAME_MACHINE"-unknown-minix exit ;; aarch64:Linux:*:*) echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" @@ -905,7 +926,7 @@ EOF echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; alpha:Linux:*:*) - case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null` in EV5) UNAME_MACHINE=alphaev5 ;; EV56) UNAME_MACHINE=alphaev56 ;; PCA56) UNAME_MACHINE=alphapca56 ;; @@ -922,7 +943,7 @@ EOF echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; arm*:Linux:*:*) - eval "$set_cc_for_build" + set_cc_for_build if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then @@ -971,23 +992,51 @@ EOF echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" exit ;; mips:Linux:*:* | mips64:Linux:*:*) - eval "$set_cc_for_build" + set_cc_for_build + IS_GLIBC=0 + test x"${LIBC}" = xgnu && IS_GLIBC=1 sed 's/^ //' << EOF > "$dummy.c" #undef CPU - #undef ${UNAME_MACHINE} - #undef ${UNAME_MACHINE}el + #undef mips + #undef mipsel + #undef mips64 + #undef mips64el + #if ${IS_GLIBC} && defined(_ABI64) + LIBCABI=gnuabi64 + #else + #if ${IS_GLIBC} && defined(_ABIN32) + LIBCABI=gnuabin32 + #else + LIBCABI=${LIBC} + #endif + #endif + + #if ${IS_GLIBC} && defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6 + CPU=mipsisa64r6 + #else + #if ${IS_GLIBC} && !defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6 + CPU=mipsisa32r6 + #else + #if defined(__mips64) + CPU=mips64 + #else + CPU=mips + #endif + #endif + #endif + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=${UNAME_MACHINE}el + MIPS_ENDIAN=el #else #if defined(__MIPSEB__) || defined(__MIPSEB) || 
defined(_MIPSEB) || defined(MIPSEB) - CPU=${UNAME_MACHINE} + MIPS_ENDIAN= #else - CPU= + MIPS_ENDIAN= #endif #endif EOF - eval "`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU'`" - test "x$CPU" != x && { echo "$CPU-unknown-linux-$LIBC"; exit; } + eval "`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU\|^MIPS_ENDIAN\|^LIBCABI'`" + test "x$CPU" != x && { echo "$CPU${MIPS_ENDIAN}-unknown-linux-$LIBCABI"; exit; } ;; mips64el:Linux:*:*) echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" @@ -1046,7 +1095,17 @@ EOF echo "$UNAME_MACHINE"-dec-linux-"$LIBC" exit ;; x86_64:Linux:*:*) - echo "$UNAME_MACHINE"-pc-linux-"$LIBC" + set_cc_for_build + LIBCABI=$LIBC + if test "$CC_FOR_BUILD" != no_compiler_found; then + if (echo '#ifdef __ILP32__'; echo IS_X32; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_X32 >/dev/null + then + LIBCABI="$LIBC"x32 + fi + fi + echo "$UNAME_MACHINE"-pc-linux-"$LIBCABI" exit ;; xtensa*:Linux:*:*) echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" @@ -1100,7 +1159,7 @@ EOF *Pentium) UNAME_MACHINE=i586 ;; *Pent*|*Celeron) UNAME_MACHINE=i686 ;; esac - echo "$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}{$UNAME_VERSION}" + echo "$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}" exit ;; i*86:*:3.2:*) if test -f /usr/options/cb.name; then @@ -1235,7 +1294,7 @@ EOF echo mips-sony-newsos6 exit ;; R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) - if [ -d /usr/nec ]; then + if test -d /usr/nec; then echo mips-nec-sysv"$UNAME_RELEASE" else echo mips-unknown-sysv"$UNAME_RELEASE" @@ -1283,39 +1342,43 @@ EOF *:Rhapsody:*:*) echo "$UNAME_MACHINE"-apple-rhapsody"$UNAME_RELEASE" exit ;; + arm64:Darwin:*:*) + echo aarch64-apple-darwin"$UNAME_RELEASE" + exit ;; *:Darwin:*:*) - UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - eval "$set_cc_for_build" - if test "$UNAME_PROCESSOR" = unknown ; then - UNAME_PROCESSOR=powerpc + UNAME_PROCESSOR=`uname -p` + case $UNAME_PROCESSOR in + 
unknown) UNAME_PROCESSOR=powerpc ;; + esac + if command -v xcode-select > /dev/null 2> /dev/null && \ + ! xcode-select --print-path > /dev/null 2> /dev/null ; then + # Avoid executing cc if there is no toolchain installed as + # cc will be a stub that puts up a graphical alert + # prompting the user to install developer tools. + CC_FOR_BUILD=no_compiler_found + else + set_cc_for_build fi - if test "`echo "$UNAME_RELEASE" | sed -e 's/\..*//'`" -le 10 ; then - if [ "$CC_FOR_BUILD" != no_compiler_found ]; then - if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null - then - case $UNAME_PROCESSOR in - i386) UNAME_PROCESSOR=x86_64 ;; - powerpc) UNAME_PROCESSOR=powerpc64 ;; - esac - fi - # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc - if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \ - (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_PPC >/dev/null - then - UNAME_PROCESSOR=powerpc - fi + if test "$CC_FOR_BUILD" != no_compiler_found; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + case $UNAME_PROCESSOR in + i386) UNAME_PROCESSOR=x86_64 ;; + powerpc) UNAME_PROCESSOR=powerpc64 ;; + esac + fi + # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc + if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_PPC >/dev/null + then + UNAME_PROCESSOR=powerpc fi elif test "$UNAME_PROCESSOR" = i386 ; then - # Avoid executing cc on OS X 10.9, as it ships with a stub - # that puts up a graphical alert prompting to install - # developer tools. Any system running Mac OS X 10.7 or - # later (Darwin 11 and later) is required to have a 64-bit - # processor. This is not true of the ARM version of Darwin - # that Apple uses in portable devices. 
- UNAME_PROCESSOR=x86_64 + # uname -m returns i386 or x86_64 + UNAME_PROCESSOR=$UNAME_MACHINE fi echo "$UNAME_PROCESSOR"-apple-darwin"$UNAME_RELEASE" exit ;; @@ -1358,6 +1421,7 @@ EOF # "uname -m" is not consistent, so use $cputype instead. 386 # is converted to i386 for consistency with other x86 # operating systems. + # shellcheck disable=SC2154 if test "$cputype" = 386; then UNAME_MACHINE=i386 else @@ -1414,8 +1478,148 @@ EOF amd64:Isilon\ OneFS:*:*) echo x86_64-unknown-onefs exit ;; + *:Unleashed:*:*) + echo "$UNAME_MACHINE"-unknown-unleashed"$UNAME_RELEASE" + exit ;; esac +# No uname command or uname output not recognized. +set_cc_for_build +cat > "$dummy.c" <<EOF +#ifdef _SEQUENT_ +#include <sys/types.h> +#include <sys/utsname.h> +#endif +#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__) +#if defined (vax) || defined (__vax) || defined (__vax__) || defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__) +#include <signal.h> +#if defined(_SIZE_T_) || defined(SIGLOST) +#include <sys/utsname.h> +#endif +#endif +#endif +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... 
*/ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include <sys/param.h> + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? 
*/ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); +#endif + +#if defined (vax) +#if !defined (ultrix) +#include <sys/param.h> +#if defined (BSD) +#if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +#else +#if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +#else + printf ("vax-dec-bsd\n"); exit (0); +#endif +#endif +#else + printf ("vax-dec-bsd\n"); exit (0); +#endif +#else +#if defined(_SIZE_T_) || defined(SIGLOST) + struct utsname un; + uname (&un); + printf ("vax-dec-ultrix%s\n", un.release); exit (0); +#else + printf ("vax-dec-ultrix\n"); exit (0); +#endif +#endif +#endif +#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__) +#if defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__) +#if defined(_SIZE_T_) || defined(SIGLOST) + struct utsname *un; + uname (&un); + printf ("mips-dec-ultrix%s\n", un.release); exit (0); +#else + printf ("mips-dec-ultrix\n"); exit (0); +#endif +#endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + +# Apollos put the system type in the environment. 
+test -d /usr/apollo && { echo "$ISP-apollo-$SYSTYPE"; exit; } + echo "$0: unable to guess system type" >&2 case "$UNAME_MACHINE:$UNAME_SYSTEM" in @@ -1438,6 +1642,12 @@ copies of config.guess and config.sub with the latest versions from: https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess and https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub +EOF + +year=`echo $timestamp | sed 's,-.*,,'` +# shellcheck disable=SC2003 +if test "`expr "\`date +%Y\`" - "$year"`" -lt 3 ; then + cat >&2 <<EOF If $0 has already been updated, send the following data and any information you think might be pertinent to config-patches@gnu.org to @@ -1465,6 +1675,7 @@ UNAME_RELEASE = "$UNAME_RELEASE" UNAME_SYSTEM = "$UNAME_SYSTEM" UNAME_VERSION = "$UNAME_VERSION" EOF +fi exit 1 @@ -1,8 +1,8 @@ #! /bin/sh # Configuration validation subroutine script. -# Copyright 1992-2018 Free Software Foundation, Inc. +# Copyright 1992-2020 Free Software Foundation, Inc. -timestamp='2018-03-08' +timestamp='2020-08-17' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -67,7 +67,7 @@ Report bugs and patches to <config-patches@gnu.org>." version="\ GNU config.sub ($timestamp) -Copyright 1992-2018 Free Software Foundation, Inc. +Copyright 1992-2020 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -89,7 +89,7 @@ while test $# -gt 0 ; do - ) # Use stdin as input. break ;; -* ) - echo "$me: invalid option $1$help" + echo "$me: invalid option $1$help" >&2 exit 1 ;; *local*) @@ -110,1223 +110,1167 @@ case $# in exit 1;; esac -# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). -# Here we must recognize all the valid KERNEL-OS combinations. 
-maybe_os=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` -case $maybe_os in - nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ - linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ - knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \ - kopensolaris*-gnu* | cloudabi*-eabi* | \ - storm-chaos* | os2-emx* | rtmk-nova*) - os=-$maybe_os - basic_machine=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` - ;; - android-linux) - os=-linux-android - basic_machine=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown - ;; - *) - basic_machine=`echo "$1" | sed 's/-[^-]*$//'` - if [ "$basic_machine" != "$1" ] - then os=`echo "$1" | sed 's/.*-/-/'` - else os=; fi - ;; -esac +# Split fields of configuration type +# shellcheck disable=SC2162 +IFS="-" read field1 field2 field3 field4 <<EOF +$1 +EOF -### Let's recognize common machines as not being operating systems so -### that things like config.sub decstation-3100 work. We also -### recognize some manufacturers as not being operating systems, so we -### can provide default operating systems below. -case $os in - -sun*os*) - # Prevent following clause from handling this invalid input. 
- ;; - -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ - -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ - -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ - -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ - -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ - -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ - -apple | -axis | -knuth | -cray | -microblaze*) - os= - basic_machine=$1 - ;; - -bluegene*) - os=-cnk - ;; - -sim | -cisco | -oki | -wec | -winbond) - os= - basic_machine=$1 - ;; - -scout) - ;; - -wrs) - os=-vxworks - basic_machine=$1 - ;; - -chorusos*) - os=-chorusos - basic_machine=$1 - ;; - -chorusrdb) - os=-chorusrdb - basic_machine=$1 - ;; - -hiux*) - os=-hiuxwe2 - ;; - -sco6) - os=-sco5v6 - basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` - ;; - -sco5) - os=-sco3.2v5 - basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` - ;; - -sco4) - os=-sco3.2v4 - basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` - ;; - -sco3.2.[4-9]*) - os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` - basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` - ;; - -sco3.2v[4-9]*) - # Don't forget version if it is 3.2v4 or newer. - basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` - ;; - -sco5v6*) - # Don't forget version if it is 3.2v4 or newer. 
- basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` - ;; - -sco*) - os=-sco3.2v2 - basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` - ;; - -udk*) - basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` - ;; - -isc) - os=-isc2.2 - basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` - ;; - -clix*) - basic_machine=clipper-intergraph - ;; - -isc*) - basic_machine=`echo "$1" | sed -e 's/86-.*/86-pc/'` - ;; - -lynx*178) - os=-lynxos178 - ;; - -lynx*5) - os=-lynxos5 +# Separate into logical components for further validation +case $1 in + *-*-*-*-*) + echo Invalid configuration \`"$1"\': more than four components >&2 + exit 1 ;; - -lynx*) - os=-lynxos + *-*-*-*) + basic_machine=$field1-$field2 + basic_os=$field3-$field4 ;; - -ptx*) - basic_machine=`echo "$1" | sed -e 's/86-.*/86-sequent/'` + *-*-*) + # Ambiguous whether COMPANY is present, or skipped and KERNEL-OS is two + # parts + maybe_os=$field2-$field3 + case $maybe_os in + nto-qnx* | linux-* | uclinux-uclibc* \ + | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* \ + | netbsd*-eabi* | kopensolaris*-gnu* | cloudabi*-eabi* \ + | storm-chaos* | os2-emx* | rtmk-nova*) + basic_machine=$field1 + basic_os=$maybe_os + ;; + android-linux) + basic_machine=$field1-unknown + basic_os=linux-android + ;; + *) + basic_machine=$field1-$field2 + basic_os=$field3 + ;; + esac ;; - -psos*) - os=-psos + *-*) + # A lone config we happen to match not fitting any pattern + case $field1-$field2 in + decstation-3100) + basic_machine=mips-dec + basic_os= + ;; + *-*) + # Second component is usually, but not always the OS + case $field2 in + # Prevent following clause from handling this valid os + sun*os*) + basic_machine=$field1 + basic_os=$field2 + ;; + # Manufacturers + dec* | mips* | sequent* | encore* | pc533* | sgi* | sony* \ + | att* | 7300* | 3300* | delta* | motorola* | sun[234]* \ + | unicom* | ibm* | next | hp | isi* | apollo | altos* \ + | convergent* | ncr* | news | 32* | 3600* | 3100* \ + | hitachi* | c[123]* | 
convex* | sun | crds | omron* | dg \ + | ultra | tti* | harris | dolphin | highlevel | gould \ + | cbm | ns | masscomp | apple | axis | knuth | cray \ + | microblaze* | sim | cisco \ + | oki | wec | wrs | winbond) + basic_machine=$field1-$field2 + basic_os= + ;; + *) + basic_machine=$field1 + basic_os=$field2 + ;; + esac + ;; + esac ;; - -mint | -mint[0-9]*) - basic_machine=m68k-atari - os=-mint + *) + # Convert single-component short-hands not valid as part of + # multi-component configurations. + case $field1 in + 386bsd) + basic_machine=i386-pc + basic_os=bsd + ;; + a29khif) + basic_machine=a29k-amd + basic_os=udi + ;; + adobe68k) + basic_machine=m68010-adobe + basic_os=scout + ;; + alliant) + basic_machine=fx80-alliant + basic_os= + ;; + altos | altos3068) + basic_machine=m68k-altos + basic_os= + ;; + am29k) + basic_machine=a29k-none + basic_os=bsd + ;; + amdahl) + basic_machine=580-amdahl + basic_os=sysv + ;; + amiga) + basic_machine=m68k-unknown + basic_os= + ;; + amigaos | amigados) + basic_machine=m68k-unknown + basic_os=amigaos + ;; + amigaunix | amix) + basic_machine=m68k-unknown + basic_os=sysv4 + ;; + apollo68) + basic_machine=m68k-apollo + basic_os=sysv + ;; + apollo68bsd) + basic_machine=m68k-apollo + basic_os=bsd + ;; + aros) + basic_machine=i386-pc + basic_os=aros + ;; + aux) + basic_machine=m68k-apple + basic_os=aux + ;; + balance) + basic_machine=ns32k-sequent + basic_os=dynix + ;; + blackfin) + basic_machine=bfin-unknown + basic_os=linux + ;; + cegcc) + basic_machine=arm-unknown + basic_os=cegcc + ;; + convex-c1) + basic_machine=c1-convex + basic_os=bsd + ;; + convex-c2) + basic_machine=c2-convex + basic_os=bsd + ;; + convex-c32) + basic_machine=c32-convex + basic_os=bsd + ;; + convex-c34) + basic_machine=c34-convex + basic_os=bsd + ;; + convex-c38) + basic_machine=c38-convex + basic_os=bsd + ;; + cray) + basic_machine=j90-cray + basic_os=unicos + ;; + crds | unos) + basic_machine=m68k-crds + basic_os= + ;; + da30) + basic_machine=m68k-da30 + 
basic_os= + ;; + decstation | pmax | pmin | dec3100 | decstatn) + basic_machine=mips-dec + basic_os= + ;; + delta88) + basic_machine=m88k-motorola + basic_os=sysv3 + ;; + dicos) + basic_machine=i686-pc + basic_os=dicos + ;; + djgpp) + basic_machine=i586-pc + basic_os=msdosdjgpp + ;; + ebmon29k) + basic_machine=a29k-amd + basic_os=ebmon + ;; + es1800 | OSE68k | ose68k | ose | OSE) + basic_machine=m68k-ericsson + basic_os=ose + ;; + gmicro) + basic_machine=tron-gmicro + basic_os=sysv + ;; + go32) + basic_machine=i386-pc + basic_os=go32 + ;; + h8300hms) + basic_machine=h8300-hitachi + basic_os=hms + ;; + h8300xray) + basic_machine=h8300-hitachi + basic_os=xray + ;; + h8500hms) + basic_machine=h8500-hitachi + basic_os=hms + ;; + harris) + basic_machine=m88k-harris + basic_os=sysv3 + ;; + hp300 | hp300hpux) + basic_machine=m68k-hp + basic_os=hpux + ;; + hp300bsd) + basic_machine=m68k-hp + basic_os=bsd + ;; + hppaosf) + basic_machine=hppa1.1-hp + basic_os=osf + ;; + hppro) + basic_machine=hppa1.1-hp + basic_os=proelf + ;; + i386mach) + basic_machine=i386-mach + basic_os=mach + ;; + isi68 | isi) + basic_machine=m68k-isi + basic_os=sysv + ;; + m68knommu) + basic_machine=m68k-unknown + basic_os=linux + ;; + magnum | m3230) + basic_machine=mips-mips + basic_os=sysv + ;; + merlin) + basic_machine=ns32k-utek + basic_os=sysv + ;; + mingw64) + basic_machine=x86_64-pc + basic_os=mingw64 + ;; + mingw32) + basic_machine=i686-pc + basic_os=mingw32 + ;; + mingw32ce) + basic_machine=arm-unknown + basic_os=mingw32ce + ;; + monitor) + basic_machine=m68k-rom68k + basic_os=coff + ;; + morphos) + basic_machine=powerpc-unknown + basic_os=morphos + ;; + moxiebox) + basic_machine=moxie-unknown + basic_os=moxiebox + ;; + msdos) + basic_machine=i386-pc + basic_os=msdos + ;; + msys) + basic_machine=i686-pc + basic_os=msys + ;; + mvs) + basic_machine=i370-ibm + basic_os=mvs + ;; + nacl) + basic_machine=le32-unknown + basic_os=nacl + ;; + ncr3000) + basic_machine=i486-ncr + basic_os=sysv4 + ;; + 
netbsd386) + basic_machine=i386-pc + basic_os=netbsd + ;; + netwinder) + basic_machine=armv4l-rebel + basic_os=linux + ;; + news | news700 | news800 | news900) + basic_machine=m68k-sony + basic_os=newsos + ;; + news1000) + basic_machine=m68030-sony + basic_os=newsos + ;; + necv70) + basic_machine=v70-nec + basic_os=sysv + ;; + nh3000) + basic_machine=m68k-harris + basic_os=cxux + ;; + nh[45]000) + basic_machine=m88k-harris + basic_os=cxux + ;; + nindy960) + basic_machine=i960-intel + basic_os=nindy + ;; + mon960) + basic_machine=i960-intel + basic_os=mon960 + ;; + nonstopux) + basic_machine=mips-compaq + basic_os=nonstopux + ;; + os400) + basic_machine=powerpc-ibm + basic_os=os400 + ;; + OSE68000 | ose68000) + basic_machine=m68000-ericsson + basic_os=ose + ;; + os68k) + basic_machine=m68k-none + basic_os=os68k + ;; + paragon) + basic_machine=i860-intel + basic_os=osf + ;; + parisc) + basic_machine=hppa-unknown + basic_os=linux + ;; + psp) + basic_machine=mipsallegrexel-sony + basic_os=psp + ;; + pw32) + basic_machine=i586-unknown + basic_os=pw32 + ;; + rdos | rdos64) + basic_machine=x86_64-pc + basic_os=rdos + ;; + rdos32) + basic_machine=i386-pc + basic_os=rdos + ;; + rom68k) + basic_machine=m68k-rom68k + basic_os=coff + ;; + sa29200) + basic_machine=a29k-amd + basic_os=udi + ;; + sei) + basic_machine=mips-sei + basic_os=seiux + ;; + sequent) + basic_machine=i386-sequent + basic_os= + ;; + sps7) + basic_machine=m68k-bull + basic_os=sysv2 + ;; + st2000) + basic_machine=m68k-tandem + basic_os= + ;; + stratus) + basic_machine=i860-stratus + basic_os=sysv4 + ;; + sun2) + basic_machine=m68000-sun + basic_os= + ;; + sun2os3) + basic_machine=m68000-sun + basic_os=sunos3 + ;; + sun2os4) + basic_machine=m68000-sun + basic_os=sunos4 + ;; + sun3) + basic_machine=m68k-sun + basic_os= + ;; + sun3os3) + basic_machine=m68k-sun + basic_os=sunos3 + ;; + sun3os4) + basic_machine=m68k-sun + basic_os=sunos4 + ;; + sun4) + basic_machine=sparc-sun + basic_os= + ;; + sun4os3) + 
basic_machine=sparc-sun + basic_os=sunos3 + ;; + sun4os4) + basic_machine=sparc-sun + basic_os=sunos4 + ;; + sun4sol2) + basic_machine=sparc-sun + basic_os=solaris2 + ;; + sun386 | sun386i | roadrunner) + basic_machine=i386-sun + basic_os= + ;; + sv1) + basic_machine=sv1-cray + basic_os=unicos + ;; + symmetry) + basic_machine=i386-sequent + basic_os=dynix + ;; + t3e) + basic_machine=alphaev5-cray + basic_os=unicos + ;; + t90) + basic_machine=t90-cray + basic_os=unicos + ;; + toad1) + basic_machine=pdp10-xkl + basic_os=tops20 + ;; + tpf) + basic_machine=s390x-ibm + basic_os=tpf + ;; + udi29k) + basic_machine=a29k-amd + basic_os=udi + ;; + ultra3) + basic_machine=a29k-nyu + basic_os=sym1 + ;; + v810 | necv810) + basic_machine=v810-nec + basic_os=none + ;; + vaxv) + basic_machine=vax-dec + basic_os=sysv + ;; + vms) + basic_machine=vax-dec + basic_os=vms + ;; + vsta) + basic_machine=i386-pc + basic_os=vsta + ;; + vxworks960) + basic_machine=i960-wrs + basic_os=vxworks + ;; + vxworks68) + basic_machine=m68k-wrs + basic_os=vxworks + ;; + vxworks29k) + basic_machine=a29k-wrs + basic_os=vxworks + ;; + xbox) + basic_machine=i686-pc + basic_os=mingw32 + ;; + ymp) + basic_machine=ymp-cray + basic_os=unicos + ;; + *) + basic_machine=$1 + basic_os= + ;; + esac ;; esac -# Decode aliases for certain CPU-COMPANY combinations. +# Decode 1-component or ad-hoc basic machines case $basic_machine in - # Recognize the basic CPU types without company name. - # Some are omitted here because they have special meanings below. 
- 1750a | 580 \ - | a29k \ - | aarch64 | aarch64_be \ - | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ - | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ - | am33_2.0 \ - | arc | arceb \ - | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ - | avr | avr32 \ - | ba \ - | be32 | be64 \ - | bfin \ - | c4x | c8051 | clipper \ - | d10v | d30v | dlx | dsp16xx \ - | e2k | epiphany \ - | fido | fr30 | frv | ft32 \ - | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ - | hexagon \ - | i370 | i860 | i960 | ia16 | ia64 \ - | ip2k | iq2000 \ - | k1om \ - | le32 | le64 \ - | lm32 \ - | m32c | m32r | m32rle | m68000 | m68k | m88k \ - | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ - | mips | mipsbe | mipseb | mipsel | mipsle \ - | mips16 \ - | mips64 | mips64el \ - | mips64octeon | mips64octeonel \ - | mips64orion | mips64orionel \ - | mips64r5900 | mips64r5900el \ - | mips64vr | mips64vrel \ - | mips64vr4100 | mips64vr4100el \ - | mips64vr4300 | mips64vr4300el \ - | mips64vr5000 | mips64vr5000el \ - | mips64vr5900 | mips64vr5900el \ - | mipsisa32 | mipsisa32el \ - | mipsisa32r2 | mipsisa32r2el \ - | mipsisa32r6 | mipsisa32r6el \ - | mipsisa64 | mipsisa64el \ - | mipsisa64r2 | mipsisa64r2el \ - | mipsisa64r6 | mipsisa64r6el \ - | mipsisa64sb1 | mipsisa64sb1el \ - | mipsisa64sr71k | mipsisa64sr71kel \ - | mipsr5900 | mipsr5900el \ - | mipstx39 | mipstx39el \ - | mn10200 | mn10300 \ - | moxie \ - | mt \ - | msp430 \ - | nds32 | nds32le | nds32be \ - | nios | nios2 | nios2eb | nios2el \ - | ns16k | ns32k \ - | open8 | or1k | or1knd | or32 \ - | pdp10 | pj | pjl \ - | powerpc | powerpc64 | powerpc64le | powerpcle \ - | pru \ - | pyramid \ - | riscv32 | riscv64 \ - | rl78 | rx \ - | score \ - | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ - | sh64 | sh64le \ - | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | 
sparclite \ - | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ - | spu \ - | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \ - | ubicom32 \ - | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \ - | visium \ - | wasm32 \ - | x86 | xc16x | xstormy16 | xtensa \ - | z8k | z80) - basic_machine=$basic_machine-unknown - ;; - c54x) - basic_machine=tic54x-unknown - ;; - c55x) - basic_machine=tic55x-unknown - ;; - c6x) - basic_machine=tic6x-unknown - ;; - leon|leon[3-9]) - basic_machine=sparc-$basic_machine - ;; - m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip) - basic_machine=$basic_machine-unknown - os=-none + # Here we handle the default manufacturer of certain CPU types. It is in + # some cases the only manufacturer, in others, it is the most popular. + w89k) + cpu=hppa1.1 + vendor=winbond ;; - m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65) + op50n) + cpu=hppa1.1 + vendor=oki ;; - ms1) - basic_machine=mt-unknown + op60c) + cpu=hppa1.1 + vendor=oki ;; - - strongarm | thumb | xscale) - basic_machine=arm-unknown + ibm*) + cpu=i370 + vendor=ibm ;; - xgate) - basic_machine=$basic_machine-unknown - os=-none + orion105) + cpu=clipper + vendor=highlevel ;; - xscaleeb) - basic_machine=armeb-unknown + mac | mpw | mac-mpw) + cpu=m68k + vendor=apple ;; - - xscaleel) - basic_machine=armel-unknown + pmac | pmac-mpw) + cpu=powerpc + vendor=apple ;; - # We use `pc' rather than `unknown' - # because (1) that's what they normally are, and - # (2) the word "unknown" tends to confuse beginning users. - i*86 | x86_64) - basic_machine=$basic_machine-pc - ;; - # Object if more than one company name word. - *-*-*) - echo Invalid configuration \`"$1"\': machine \`"$basic_machine"\' not recognized 1>&2 - exit 1 - ;; - # Recognize the basic CPU types with company name. 
- 580-* \ - | a29k-* \ - | aarch64-* | aarch64_be-* \ - | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ - | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ - | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \ - | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ - | avr-* | avr32-* \ - | ba-* \ - | be32-* | be64-* \ - | bfin-* | bs2000-* \ - | c[123]* | c30-* | [cjt]90-* | c4x-* \ - | c8051-* | clipper-* | craynv-* | cydra-* \ - | d10v-* | d30v-* | dlx-* \ - | e2k-* | elxsi-* \ - | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ - | h8300-* | h8500-* \ - | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ - | hexagon-* \ - | i*86-* | i860-* | i960-* | ia16-* | ia64-* \ - | ip2k-* | iq2000-* \ - | k1om-* \ - | le32-* | le64-* \ - | lm32-* \ - | m32c-* | m32r-* | m32rle-* \ - | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ - | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ - | microblaze-* | microblazeel-* \ - | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ - | mips16-* \ - | mips64-* | mips64el-* \ - | mips64octeon-* | mips64octeonel-* \ - | mips64orion-* | mips64orionel-* \ - | mips64r5900-* | mips64r5900el-* \ - | mips64vr-* | mips64vrel-* \ - | mips64vr4100-* | mips64vr4100el-* \ - | mips64vr4300-* | mips64vr4300el-* \ - | mips64vr5000-* | mips64vr5000el-* \ - | mips64vr5900-* | mips64vr5900el-* \ - | mipsisa32-* | mipsisa32el-* \ - | mipsisa32r2-* | mipsisa32r2el-* \ - | mipsisa32r6-* | mipsisa32r6el-* \ - | mipsisa64-* | mipsisa64el-* \ - | mipsisa64r2-* | mipsisa64r2el-* \ - | mipsisa64r6-* | mipsisa64r6el-* \ - | mipsisa64sb1-* | mipsisa64sb1el-* \ - | mipsisa64sr71k-* | mipsisa64sr71kel-* \ - | mipsr5900-* | mipsr5900el-* \ - | mipstx39-* | mipstx39el-* \ - | mmix-* \ - | mt-* \ - | msp430-* \ - | nds32-* | nds32le-* | nds32be-* \ - | nios-* | nios2-* | nios2eb-* | nios2el-* \ - | none-* | np1-* | ns16k-* | ns32k-* \ - | open8-* \ - | or1k*-* \ - | orion-* \ - | pdp10-* | pdp11-* 
| pj-* | pjl-* | pn-* | power-* \ - | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \ - | pru-* \ - | pyramid-* \ - | riscv32-* | riscv64-* \ - | rl78-* | romp-* | rs6000-* | rx-* \ - | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ - | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ - | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ - | sparclite-* \ - | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \ - | tahoe-* \ - | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ - | tile*-* \ - | tron-* \ - | ubicom32-* \ - | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \ - | vax-* \ - | visium-* \ - | wasm32-* \ - | we32k-* \ - | x86-* | x86_64-* | xc16x-* | xps100-* \ - | xstormy16-* | xtensa*-* \ - | ymp-* \ - | z8k-* | z80-*) - ;; - # Recognize the basic CPU types without company name, with glob match. - xtensa*) - basic_machine=$basic_machine-unknown - ;; # Recognize the various machine names and aliases which stand # for a CPU type and a company and sometimes even an OS. 
- 386bsd) - basic_machine=i386-pc - os=-bsd - ;; 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) - basic_machine=m68000-att + cpu=m68000 + vendor=att ;; 3b*) - basic_machine=we32k-att - ;; - a29khif) - basic_machine=a29k-amd - os=-udi - ;; - abacus) - basic_machine=abacus-unknown - ;; - adobe68k) - basic_machine=m68010-adobe - os=-scout - ;; - alliant | fx80) - basic_machine=fx80-alliant - ;; - altos | altos3068) - basic_machine=m68k-altos - ;; - am29k) - basic_machine=a29k-none - os=-bsd - ;; - amd64) - basic_machine=x86_64-pc - ;; - amd64-*) - basic_machine=x86_64-`echo "$basic_machine" | sed 's/^[^-]*-//'` - ;; - amdahl) - basic_machine=580-amdahl - os=-sysv - ;; - amiga | amiga-*) - basic_machine=m68k-unknown - ;; - amigaos | amigados) - basic_machine=m68k-unknown - os=-amigaos - ;; - amigaunix | amix) - basic_machine=m68k-unknown - os=-sysv4 - ;; - apollo68) - basic_machine=m68k-apollo - os=-sysv - ;; - apollo68bsd) - basic_machine=m68k-apollo - os=-bsd - ;; - aros) - basic_machine=i386-pc - os=-aros - ;; - asmjs) - basic_machine=asmjs-unknown - ;; - aux) - basic_machine=m68k-apple - os=-aux - ;; - balance) - basic_machine=ns32k-sequent - os=-dynix - ;; - blackfin) - basic_machine=bfin-unknown - os=-linux - ;; - blackfin-*) - basic_machine=bfin-`echo "$basic_machine" | sed 's/^[^-]*-//'` - os=-linux + cpu=we32k + vendor=att ;; bluegene*) - basic_machine=powerpc-ibm - os=-cnk - ;; - c54x-*) - basic_machine=tic54x-`echo "$basic_machine" | sed 's/^[^-]*-//'` - ;; - c55x-*) - basic_machine=tic55x-`echo "$basic_machine" | sed 's/^[^-]*-//'` - ;; - c6x-*) - basic_machine=tic6x-`echo "$basic_machine" | sed 's/^[^-]*-//'` - ;; - c90) - basic_machine=c90-cray - os=-unicos - ;; - cegcc) - basic_machine=arm-unknown - os=-cegcc - ;; - convex-c1) - basic_machine=c1-convex - os=-bsd - ;; - convex-c2) - basic_machine=c2-convex - os=-bsd - ;; - convex-c32) - basic_machine=c32-convex - os=-bsd - ;; - convex-c34) - basic_machine=c34-convex - os=-bsd - ;; - 
convex-c38) - basic_machine=c38-convex - os=-bsd - ;; - cray | j90) - basic_machine=j90-cray - os=-unicos - ;; - craynv) - basic_machine=craynv-cray - os=-unicosmp - ;; - cr16 | cr16-*) - basic_machine=cr16-unknown - os=-elf - ;; - crds | unos) - basic_machine=m68k-crds - ;; - crisv32 | crisv32-* | etraxfs*) - basic_machine=crisv32-axis - ;; - cris | cris-* | etrax*) - basic_machine=cris-axis - ;; - crx) - basic_machine=crx-unknown - os=-elf - ;; - da30 | da30-*) - basic_machine=m68k-da30 - ;; - decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) - basic_machine=mips-dec + cpu=powerpc + vendor=ibm + basic_os=cnk ;; decsystem10* | dec10*) - basic_machine=pdp10-dec - os=-tops10 + cpu=pdp10 + vendor=dec + basic_os=tops10 ;; decsystem20* | dec20*) - basic_machine=pdp10-dec - os=-tops20 + cpu=pdp10 + vendor=dec + basic_os=tops20 ;; delta | 3300 | motorola-3300 | motorola-delta \ | 3300-motorola | delta-motorola) - basic_machine=m68k-motorola - ;; - delta88) - basic_machine=m88k-motorola - os=-sysv3 - ;; - dicos) - basic_machine=i686-pc - os=-dicos - ;; - djgpp) - basic_machine=i586-pc - os=-msdosdjgpp - ;; - dpx20 | dpx20-*) - basic_machine=rs6000-bull - os=-bosx + cpu=m68k + vendor=motorola ;; dpx2*) - basic_machine=m68k-bull - os=-sysv3 - ;; - e500v[12]) - basic_machine=powerpc-unknown - os=$os"spe" - ;; - e500v[12]-*) - basic_machine=powerpc-`echo "$basic_machine" | sed 's/^[^-]*-//'` - os=$os"spe" - ;; - ebmon29k) - basic_machine=a29k-amd - os=-ebmon - ;; - elxsi) - basic_machine=elxsi-elxsi - os=-bsd + cpu=m68k + vendor=bull + basic_os=sysv3 ;; encore | umax | mmax) - basic_machine=ns32k-encore + cpu=ns32k + vendor=encore ;; - es1800 | OSE68k | ose68k | ose | OSE) - basic_machine=m68k-ericsson - os=-ose + elxsi) + cpu=elxsi + vendor=elxsi + basic_os=${basic_os:-bsd} ;; fx2800) - basic_machine=i860-alliant + cpu=i860 + vendor=alliant ;; genix) - basic_machine=ns32k-ns - ;; - gmicro) - basic_machine=tron-gmicro - os=-sysv - ;; - go32) - 
basic_machine=i386-pc - os=-go32 + cpu=ns32k + vendor=ns ;; h3050r* | hiux*) - basic_machine=hppa1.1-hitachi - os=-hiuxwe2 - ;; - h8300hms) - basic_machine=h8300-hitachi - os=-hms - ;; - h8300xray) - basic_machine=h8300-hitachi - os=-xray - ;; - h8500hms) - basic_machine=h8500-hitachi - os=-hms - ;; - harris) - basic_machine=m88k-harris - os=-sysv3 - ;; - hp300-*) - basic_machine=m68k-hp - ;; - hp300bsd) - basic_machine=m68k-hp - os=-bsd - ;; - hp300hpux) - basic_machine=m68k-hp - os=-hpux + cpu=hppa1.1 + vendor=hitachi + basic_os=hiuxwe2 ;; hp3k9[0-9][0-9] | hp9[0-9][0-9]) - basic_machine=hppa1.0-hp + cpu=hppa1.0 + vendor=hp ;; hp9k2[0-9][0-9] | hp9k31[0-9]) - basic_machine=m68000-hp + cpu=m68000 + vendor=hp ;; hp9k3[2-9][0-9]) - basic_machine=m68k-hp + cpu=m68k + vendor=hp ;; hp9k6[0-9][0-9] | hp6[0-9][0-9]) - basic_machine=hppa1.0-hp + cpu=hppa1.0 + vendor=hp ;; hp9k7[0-79][0-9] | hp7[0-79][0-9]) - basic_machine=hppa1.1-hp + cpu=hppa1.1 + vendor=hp ;; hp9k78[0-9] | hp78[0-9]) # FIXME: really hppa2.0-hp - basic_machine=hppa1.1-hp + cpu=hppa1.1 + vendor=hp ;; hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) # FIXME: really hppa2.0-hp - basic_machine=hppa1.1-hp + cpu=hppa1.1 + vendor=hp ;; hp9k8[0-9][13679] | hp8[0-9][13679]) - basic_machine=hppa1.1-hp + cpu=hppa1.1 + vendor=hp ;; hp9k8[0-9][0-9] | hp8[0-9][0-9]) - basic_machine=hppa1.0-hp - ;; - hppaosf) - basic_machine=hppa1.1-hp - os=-osf - ;; - hppro) - basic_machine=hppa1.1-hp - os=-proelf - ;; - i370-ibm* | ibm*) - basic_machine=i370-ibm + cpu=hppa1.0 + vendor=hp ;; i*86v32) - basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` - os=-sysv32 + cpu=`echo "$1" | sed -e 's/86.*/86/'` + vendor=pc + basic_os=sysv32 ;; i*86v4*) - basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` - os=-sysv4 + cpu=`echo "$1" | sed -e 's/86.*/86/'` + vendor=pc + basic_os=sysv4 ;; i*86v) - basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` - os=-sysv + cpu=`echo "$1" | sed -e 's/86.*/86/'` + 
vendor=pc + basic_os=sysv ;; i*86sol2) - basic_machine=`echo "$1" | sed -e 's/86.*/86-pc/'` - os=-solaris2 - ;; - i386mach) - basic_machine=i386-mach - os=-mach + cpu=`echo "$1" | sed -e 's/86.*/86/'` + vendor=pc + basic_os=solaris2 ;; - vsta) - basic_machine=i386-unknown - os=-vsta + j90 | j90-cray) + cpu=j90 + vendor=cray + basic_os=${basic_os:-unicos} ;; iris | iris4d) - basic_machine=mips-sgi - case $os in - -irix*) + cpu=mips + vendor=sgi + case $basic_os in + irix*) ;; *) - os=-irix4 + basic_os=irix4 ;; esac ;; - isi68 | isi) - basic_machine=m68k-isi - os=-sysv - ;; - leon-*|leon[3-9]-*) - basic_machine=sparc-`echo "$basic_machine" | sed 's/-.*//'` - ;; - m68knommu) - basic_machine=m68k-unknown - os=-linux - ;; - m68knommu-*) - basic_machine=m68k-`echo "$basic_machine" | sed 's/^[^-]*-//'` - os=-linux - ;; - magnum | m3230) - basic_machine=mips-mips - os=-sysv - ;; - merlin) - basic_machine=ns32k-utek - os=-sysv - ;; - microblaze*) - basic_machine=microblaze-xilinx - ;; - mingw64) - basic_machine=x86_64-pc - os=-mingw64 - ;; - mingw32) - basic_machine=i686-pc - os=-mingw32 - ;; - mingw32ce) - basic_machine=arm-unknown - os=-mingw32ce - ;; miniframe) - basic_machine=m68000-convergent - ;; - *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) - basic_machine=m68k-atari - os=-mint - ;; - mips3*-*) - basic_machine=`echo "$basic_machine" | sed -e 's/mips3/mips64/'` - ;; - mips3*) - basic_machine=`echo "$basic_machine" | sed -e 's/mips3/mips64/'`-unknown - ;; - monitor) - basic_machine=m68k-rom68k - os=-coff - ;; - morphos) - basic_machine=powerpc-unknown - os=-morphos - ;; - moxiebox) - basic_machine=moxie-unknown - os=-moxiebox + cpu=m68000 + vendor=convergent ;; - msdos) - basic_machine=i386-pc - os=-msdos - ;; - ms1-*) - basic_machine=`echo "$basic_machine" | sed -e 's/ms1-/mt-/'` - ;; - msys) - basic_machine=i686-pc - os=-msys - ;; - mvs) - basic_machine=i370-ibm - os=-mvs - ;; - nacl) - basic_machine=le32-unknown - os=-nacl - ;; - ncr3000) - basic_machine=i486-ncr - 
os=-sysv4 - ;; - netbsd386) - basic_machine=i386-unknown - os=-netbsd - ;; - netwinder) - basic_machine=armv4l-rebel - os=-linux - ;; - news | news700 | news800 | news900) - basic_machine=m68k-sony - os=-newsos - ;; - news1000) - basic_machine=m68030-sony - os=-newsos + *mint | mint[0-9]* | *MiNT | *MiNT[0-9]*) + cpu=m68k + vendor=atari + basic_os=mint ;; news-3600 | risc-news) - basic_machine=mips-sony - os=-newsos - ;; - necv70) - basic_machine=v70-nec - os=-sysv + cpu=mips + vendor=sony + basic_os=newsos ;; next | m*-next) - basic_machine=m68k-next - case $os in - -nextstep* ) + cpu=m68k + vendor=next + case $basic_os in + openstep*) + ;; + nextstep*) ;; - -ns2*) - os=-nextstep2 + ns2*) + basic_os=nextstep2 ;; *) - os=-nextstep3 + basic_os=nextstep3 ;; esac ;; - nh3000) - basic_machine=m68k-harris - os=-cxux - ;; - nh[45]000) - basic_machine=m88k-harris - os=-cxux - ;; - nindy960) - basic_machine=i960-intel - os=-nindy - ;; - mon960) - basic_machine=i960-intel - os=-mon960 - ;; - nonstopux) - basic_machine=mips-compaq - os=-nonstopux - ;; np1) - basic_machine=np1-gould - ;; - neo-tandem) - basic_machine=neo-tandem - ;; - nse-tandem) - basic_machine=nse-tandem - ;; - nsr-tandem) - basic_machine=nsr-tandem - ;; - nsv-tandem) - basic_machine=nsv-tandem - ;; - nsx-tandem) - basic_machine=nsx-tandem + cpu=np1 + vendor=gould ;; op50n-* | op60c-*) - basic_machine=hppa1.1-oki - os=-proelf - ;; - openrisc | openrisc-*) - basic_machine=or32-unknown - ;; - os400) - basic_machine=powerpc-ibm - os=-os400 - ;; - OSE68000 | ose68000) - basic_machine=m68000-ericsson - os=-ose - ;; - os68k) - basic_machine=m68k-none - os=-os68k + cpu=hppa1.1 + vendor=oki + basic_os=proelf ;; pa-hitachi) - basic_machine=hppa1.1-hitachi - os=-hiuxwe2 - ;; - paragon) - basic_machine=i860-intel - os=-osf - ;; - parisc) - basic_machine=hppa-unknown - os=-linux - ;; - parisc-*) - basic_machine=hppa-`echo "$basic_machine" | sed 's/^[^-]*-//'` - os=-linux + cpu=hppa1.1 + vendor=hitachi + 
basic_os=hiuxwe2 ;; pbd) - basic_machine=sparc-tti + cpu=sparc + vendor=tti ;; pbb) - basic_machine=m68k-tti - ;; - pc532 | pc532-*) - basic_machine=ns32k-pc532 - ;; - pc98) - basic_machine=i386-pc + cpu=m68k + vendor=tti ;; - pc98-*) - basic_machine=i386-`echo "$basic_machine" | sed 's/^[^-]*-//'` - ;; - pentium | p5 | k5 | k6 | nexgen | viac3) - basic_machine=i586-pc - ;; - pentiumpro | p6 | 6x86 | athlon | athlon_*) - basic_machine=i686-pc - ;; - pentiumii | pentium2 | pentiumiii | pentium3) - basic_machine=i686-pc - ;; - pentium4) - basic_machine=i786-pc - ;; - pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) - basic_machine=i586-`echo "$basic_machine" | sed 's/^[^-]*-//'` - ;; - pentiumpro-* | p6-* | 6x86-* | athlon-*) - basic_machine=i686-`echo "$basic_machine" | sed 's/^[^-]*-//'` - ;; - pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) - basic_machine=i686-`echo "$basic_machine" | sed 's/^[^-]*-//'` - ;; - pentium4-*) - basic_machine=i786-`echo "$basic_machine" | sed 's/^[^-]*-//'` + pc532) + cpu=ns32k + vendor=pc532 ;; pn) - basic_machine=pn-gould - ;; - power) basic_machine=power-ibm - ;; - ppc | ppcbe) basic_machine=powerpc-unknown + cpu=pn + vendor=gould ;; - ppc-* | ppcbe-*) - basic_machine=powerpc-`echo "$basic_machine" | sed 's/^[^-]*-//'` - ;; - ppcle | powerpclittle) - basic_machine=powerpcle-unknown - ;; - ppcle-* | powerpclittle-*) - basic_machine=powerpcle-`echo "$basic_machine" | sed 's/^[^-]*-//'` + power) + cpu=power + vendor=ibm ;; - ppc64) basic_machine=powerpc64-unknown + ps2) + cpu=i386 + vendor=ibm ;; - ppc64-*) basic_machine=powerpc64-`echo "$basic_machine" | sed 's/^[^-]*-//'` + rm[46]00) + cpu=mips + vendor=siemens ;; - ppc64le | powerpc64little) - basic_machine=powerpc64le-unknown + rtpc | rtpc-*) + cpu=romp + vendor=ibm ;; - ppc64le-* | powerpc64little-*) - basic_machine=powerpc64le-`echo "$basic_machine" | sed 's/^[^-]*-//'` + sde) + cpu=mipsisa32 + vendor=sde + basic_os=${basic_os:-elf} ;; - ps2) - basic_machine=i386-ibm + 
simso-wrs) + cpu=sparclite + vendor=wrs + basic_os=vxworks ;; - pw32) - basic_machine=i586-unknown - os=-pw32 + tower | tower-32) + cpu=m68k + vendor=ncr ;; - rdos | rdos64) - basic_machine=x86_64-pc - os=-rdos + vpp*|vx|vx-*) + cpu=f301 + vendor=fujitsu ;; - rdos32) - basic_machine=i386-pc - os=-rdos + w65) + cpu=w65 + vendor=wdc ;; - rom68k) - basic_machine=m68k-rom68k - os=-coff + w89k-*) + cpu=hppa1.1 + vendor=winbond + basic_os=proelf ;; - rm[46]00) - basic_machine=mips-siemens + none) + cpu=none + vendor=none ;; - rtpc | rtpc-*) - basic_machine=romp-ibm + leon|leon[3-9]) + cpu=sparc + vendor=$basic_machine ;; - s390 | s390-*) - basic_machine=s390-ibm + leon-*|leon[3-9]-*) + cpu=sparc + vendor=`echo "$basic_machine" | sed 's/-.*//'` ;; - s390x | s390x-*) - basic_machine=s390x-ibm + + *-*) + # shellcheck disable=SC2162 + IFS="-" read cpu vendor <<EOF +$basic_machine +EOF ;; - sa29200) - basic_machine=a29k-amd - os=-udi + # We use `pc' rather than `unknown' + # because (1) that's what they normally are, and + # (2) the word "unknown" tends to confuse beginning users. + i*86 | x86_64) + cpu=$basic_machine + vendor=pc ;; - sb1) - basic_machine=mipsisa64sb1-unknown + # These rules are duplicated from below for sake of the special case above; + # i.e. things that normalized to x86 arches should also default to "pc" + pc98) + cpu=i386 + vendor=pc ;; - sb1el) - basic_machine=mipsisa64sb1el-unknown + x64 | amd64) + cpu=x86_64 + vendor=pc ;; - sde) - basic_machine=mipsisa32-sde - os=-elf + # Recognize the basic CPU types without company name. + *) + cpu=$basic_machine + vendor=unknown ;; - sei) - basic_machine=mips-sei - os=-seiux +esac + +unset -v basic_machine + +# Decode basic machines in the full and proper CPU-Company form. +case $cpu-$vendor in + # Here we handle the default manufacturer of certain CPU types in canonical form. It is in + # some cases the only manufacturer, in others, it is the most popular. 
+ craynv-unknown) + vendor=cray + basic_os=${basic_os:-unicosmp} ;; - sequent) - basic_machine=i386-sequent + c90-unknown | c90-cray) + vendor=cray + basic_os=${Basic_os:-unicos} ;; - sh5el) - basic_machine=sh5le-unknown + fx80-unknown) + vendor=alliant ;; - simso-wrs) - basic_machine=sparclite-wrs - os=-vxworks + romp-unknown) + vendor=ibm ;; - sps7) - basic_machine=m68k-bull - os=-sysv2 + mmix-unknown) + vendor=knuth ;; - spur) - basic_machine=spur-unknown + microblaze-unknown | microblazeel-unknown) + vendor=xilinx ;; - st2000) - basic_machine=m68k-tandem + rs6000-unknown) + vendor=ibm ;; - stratus) - basic_machine=i860-stratus - os=-sysv4 + vax-unknown) + vendor=dec ;; - strongarm-* | thumb-*) - basic_machine=arm-`echo "$basic_machine" | sed 's/^[^-]*-//'` + pdp11-unknown) + vendor=dec ;; - sun2) - basic_machine=m68000-sun + we32k-unknown) + vendor=att ;; - sun2os3) - basic_machine=m68000-sun - os=-sunos3 + cydra-unknown) + vendor=cydrome ;; - sun2os4) - basic_machine=m68000-sun - os=-sunos4 + i370-ibm*) + vendor=ibm ;; - sun3os3) - basic_machine=m68k-sun - os=-sunos3 + orion-unknown) + vendor=highlevel ;; - sun3os4) - basic_machine=m68k-sun - os=-sunos4 + xps-unknown | xps100-unknown) + cpu=xps100 + vendor=honeywell ;; - sun4os3) - basic_machine=sparc-sun - os=-sunos3 + + # Here we normalize CPU types with a missing or matching vendor + dpx20-unknown | dpx20-bull) + cpu=rs6000 + vendor=bull + basic_os=${basic_os:-bosx} ;; - sun4os4) - basic_machine=sparc-sun - os=-sunos4 + + # Here we normalize CPU types irrespective of the vendor + amd64-*) + cpu=x86_64 ;; - sun4sol2) - basic_machine=sparc-sun - os=-solaris2 + blackfin-*) + cpu=bfin + basic_os=linux ;; - sun3 | sun3-*) - basic_machine=m68k-sun + c54x-*) + cpu=tic54x ;; - sun4) - basic_machine=sparc-sun + c55x-*) + cpu=tic55x ;; - sun386 | sun386i | roadrunner) - basic_machine=i386-sun + c6x-*) + cpu=tic6x ;; - sv1) - basic_machine=sv1-cray - os=-unicos + e500v[12]-*) + cpu=powerpc + basic_os=${basic_os}"spe" 
;; - symmetry) - basic_machine=i386-sequent - os=-dynix + mips3*-*) + cpu=mips64 ;; - t3e) - basic_machine=alphaev5-cray - os=-unicos + ms1-*) + cpu=mt ;; - t90) - basic_machine=t90-cray - os=-unicos + m68knommu-*) + cpu=m68k + basic_os=linux ;; - tile*) - basic_machine=$basic_machine-unknown - os=-linux-gnu + m9s12z-* | m68hcs12z-* | hcs12z-* | s12z-*) + cpu=s12z ;; - tx39) - basic_machine=mipstx39-unknown + openrisc-*) + cpu=or32 ;; - tx39el) - basic_machine=mipstx39el-unknown + parisc-*) + cpu=hppa + basic_os=linux ;; - toad1) - basic_machine=pdp10-xkl - os=-tops20 + pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) + cpu=i586 ;; - tower | tower-32) - basic_machine=m68k-ncr + pentiumpro-* | p6-* | 6x86-* | athlon-* | athalon_*-*) + cpu=i686 ;; - tpf) - basic_machine=s390x-ibm - os=-tpf + pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) + cpu=i686 ;; - udi29k) - basic_machine=a29k-amd - os=-udi + pentium4-*) + cpu=i786 ;; - ultra3) - basic_machine=a29k-nyu - os=-sym1 + pc98-*) + cpu=i386 ;; - v810 | necv810) - basic_machine=v810-nec - os=-none + ppc-* | ppcbe-*) + cpu=powerpc ;; - vaxv) - basic_machine=vax-dec - os=-sysv + ppcle-* | powerpclittle-*) + cpu=powerpcle ;; - vms) - basic_machine=vax-dec - os=-vms + ppc64-*) + cpu=powerpc64 ;; - vpp*|vx|vx-*) - basic_machine=f301-fujitsu + ppc64le-* | powerpc64little-*) + cpu=powerpc64le ;; - vxworks960) - basic_machine=i960-wrs - os=-vxworks + sb1-*) + cpu=mipsisa64sb1 ;; - vxworks68) - basic_machine=m68k-wrs - os=-vxworks + sb1el-*) + cpu=mipsisa64sb1el ;; - vxworks29k) - basic_machine=a29k-wrs - os=-vxworks + sh5e[lb]-*) + cpu=`echo "$cpu" | sed 's/^\(sh.\)e\(.\)$/\1\2e/'` ;; - w65*) - basic_machine=w65-wdc - os=-none + spur-*) + cpu=spur ;; - w89k-*) - basic_machine=hppa1.1-winbond - os=-proelf + strongarm-* | thumb-*) + cpu=arm ;; - x64) - basic_machine=x86_64-pc + tx39-*) + cpu=mipstx39 ;; - xbox) - basic_machine=i686-pc - os=-mingw32 + tx39el-*) + cpu=mipstx39el ;; - xps | xps100) - 
basic_machine=xps100-honeywell + x64-*) + cpu=x86_64 ;; xscale-* | xscalee[bl]-*) - basic_machine=`echo "$basic_machine" | sed 's/^xscale/arm/'` - ;; - ymp) - basic_machine=ymp-cray - os=-unicos + cpu=`echo "$cpu" | sed 's/^xscale/arm/'` ;; - none) - basic_machine=none-none - os=-none + arm64-*) + cpu=aarch64 ;; -# Here we handle the default manufacturer of certain CPU types. It is in -# some cases the only manufacturer, in others, it is the most popular. - w89k) - basic_machine=hppa1.1-winbond - ;; - op50n) - basic_machine=hppa1.1-oki - ;; - op60c) - basic_machine=hppa1.1-oki - ;; - romp) - basic_machine=romp-ibm + # Recognize the canonical CPU Types that limit and/or modify the + # company names they are paired with. + cr16-*) + basic_os=${basic_os:-elf} ;; - mmix) - basic_machine=mmix-knuth + crisv32-* | etraxfs*-*) + cpu=crisv32 + vendor=axis ;; - rs6000) - basic_machine=rs6000-ibm + cris-* | etrax*-*) + cpu=cris + vendor=axis ;; - vax) - basic_machine=vax-dec + crx-*) + basic_os=${basic_os:-elf} ;; - pdp11) - basic_machine=pdp11-dec - ;; - we32k) - basic_machine=we32k-att - ;; - sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) - basic_machine=sh-unknown + neo-tandem) + cpu=neo + vendor=tandem ;; - cydra) - basic_machine=cydra-cydrome + nse-tandem) + cpu=nse + vendor=tandem ;; - orion) - basic_machine=orion-highlevel + nsr-tandem) + cpu=nsr + vendor=tandem ;; - orion105) - basic_machine=clipper-highlevel + nsv-tandem) + cpu=nsv + vendor=tandem ;; - mac | mpw | mac-mpw) - basic_machine=m68k-apple + nsx-tandem) + cpu=nsx + vendor=tandem ;; - pmac | pmac-mpw) - basic_machine=powerpc-apple + mipsallegrexel-sony) + cpu=mipsallegrexel + vendor=sony ;; - *-unknown) - # Make sure to match an already-canonicalized machine name. 
+ tile*-*) + basic_os=${basic_os:-linux-gnu} ;; + *) - echo Invalid configuration \`"$1"\': machine \`"$basic_machine"\' not recognized 1>&2 - exit 1 + # Recognize the canonical CPU types that are allowed with any + # company name. + case $cpu in + 1750a | 580 \ + | a29k \ + | aarch64 | aarch64_be \ + | abacus \ + | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] \ + | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] \ + | alphapca5[67] | alpha64pca5[67] \ + | am33_2.0 \ + | amdgcn \ + | arc | arceb \ + | arm | arm[lb]e | arme[lb] | armv* \ + | avr | avr32 \ + | asmjs \ + | ba \ + | be32 | be64 \ + | bfin | bpf | bs2000 \ + | c[123]* | c30 | [cjt]90 | c4x \ + | c8051 | clipper | craynv | csky | cydra \ + | d10v | d30v | dlx | dsp16xx \ + | e2k | elxsi | epiphany \ + | f30[01] | f700 | fido | fr30 | frv | ft32 | fx80 \ + | h8300 | h8500 \ + | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ + | hexagon \ + | i370 | i*86 | i860 | i960 | ia16 | ia64 \ + | ip2k | iq2000 \ + | k1om \ + | le32 | le64 \ + | lm32 \ + | m32c | m32r | m32rle \ + | m5200 | m68000 | m680[012346]0 | m68360 | m683?2 | m68k \ + | m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x \ + | m88110 | m88k | maxq | mb | mcore | mep | metag \ + | microblaze | microblazeel \ + | mips | mipsbe | mipseb | mipsel | mipsle \ + | mips16 \ + | mips64 | mips64eb | mips64el \ + | mips64octeon | mips64octeonel \ + | mips64orion | mips64orionel \ + | mips64r5900 | mips64r5900el \ + | mips64vr | mips64vrel \ + | mips64vr4100 | mips64vr4100el \ + | mips64vr4300 | mips64vr4300el \ + | mips64vr5000 | mips64vr5000el \ + | mips64vr5900 | mips64vr5900el \ + | mipsisa32 | mipsisa32el \ + | mipsisa32r2 | mipsisa32r2el \ + | mipsisa32r6 | mipsisa32r6el \ + | mipsisa64 | mipsisa64el \ + | mipsisa64r2 | mipsisa64r2el \ + | mipsisa64r6 | mipsisa64r6el \ + | mipsisa64sb1 | mipsisa64sb1el \ + | mipsisa64sr71k | mipsisa64sr71kel \ + | mipsr5900 | mipsr5900el \ + | mipstx39 | mipstx39el \ + | mmix \ + | mn10200 | mn10300 \ + 
| moxie \ + | mt \ + | msp430 \ + | nds32 | nds32le | nds32be \ + | nfp \ + | nios | nios2 | nios2eb | nios2el \ + | none | np1 | ns16k | ns32k | nvptx \ + | open8 \ + | or1k* \ + | or32 \ + | orion \ + | picochip \ + | pdp10 | pdp11 | pj | pjl | pn | power \ + | powerpc | powerpc64 | powerpc64le | powerpcle | powerpcspe \ + | pru \ + | pyramid \ + | riscv | riscv32 | riscv64 \ + | rl78 | romp | rs6000 | rx \ + | s390 | s390x \ + | score \ + | sh | shl \ + | sh[1234] | sh[24]a | sh[24]ae[lb] | sh[23]e | she[lb] | sh[lb]e \ + | sh[1234]e[lb] | sh[12345][lb]e | sh[23]ele | sh64 | sh64le \ + | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet \ + | sparclite \ + | sparcv8 | sparcv9 | sparcv9b | sparcv9v | sv1 | sx* \ + | spu \ + | tahoe \ + | tic30 | tic4x | tic54x | tic55x | tic6x | tic80 \ + | tron \ + | ubicom32 \ + | v70 | v850 | v850e | v850e1 | v850es | v850e2 | v850e2v3 \ + | vax \ + | visium \ + | w65 \ + | wasm32 | wasm64 \ + | we32k \ + | x86 | x86_64 | xc16x | xgate | xps100 \ + | xstormy16 | xtensa* \ + | ymp \ + | z8k | z80) + ;; + + *) + echo Invalid configuration \`"$1"\': machine \`"$cpu-$vendor"\' not recognized 1>&2 + exit 1 + ;; + esac ;; esac # Here we canonicalize certain aliases for manufacturers. -case $basic_machine in - *-digital*) - basic_machine=`echo "$basic_machine" | sed 's/digital.*/dec/'` +case $vendor in + digital*) + vendor=dec ;; - *-commodore*) - basic_machine=`echo "$basic_machine" | sed 's/commodore.*/cbm/'` + commodore*) + vendor=cbm ;; *) ;; @@ -1334,203 +1278,215 @@ esac # Decode manufacturer-specific aliases for certain operating systems. -if [ x"$os" != x"" ] +if test x$basic_os != x then + +# First recognize some ad-hoc cases, or perhaps split kernel-os, or else just +# set os. 
+case $basic_os in + gnu/linux*) + kernel=linux + os=`echo $basic_os | sed -e 's|gnu/linux|gnu|'` + ;; + nto-qnx*) + kernel=nto + os=`echo $basic_os | sed -e 's|nto-qnx|qnx|'` + ;; + *-*) + # shellcheck disable=SC2162 + IFS="-" read kernel os <<EOF +$basic_os +EOF + ;; + # Default OS when just kernel was specified + nto*) + kernel=nto + os=`echo $basic_os | sed -e 's|nto|qnx|'` + ;; + linux*) + kernel=linux + os=`echo $basic_os | sed -e 's|linux|gnu|'` + ;; + *) + kernel= + os=$basic_os + ;; +esac + +# Now, normalize the OS (knowing we just have one component, it's not a kernel, +# etc.) case $os in # First match some system type aliases that might get confused # with valid system types. - # -solaris* is a basic system type, with this one exception. - -auroraux) - os=-auroraux + # solaris* is a basic system type, with this one exception. + auroraux) + os=auroraux ;; - -solaris1 | -solaris1.*) - os=`echo $os | sed -e 's|solaris1|sunos4|'` + bluegene*) + os=cnk ;; - -solaris) - os=-solaris2 + solaris1 | solaris1.*) + os=`echo $os | sed -e 's|solaris1|sunos4|'` ;; - -unixware*) - os=-sysv4.2uw + solaris) + os=solaris2 ;; - -gnu/linux*) - os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` + unixware*) + os=sysv4.2uw ;; # es1800 is here to avoid being matched by es* (a different OS) - -es1800*) - os=-ose + es1800*) + os=ose ;; - # Now accept the basic system types. - # The portable systems comes first. - # Each alternative MUST end in a * to match a version number. - # -sysv* is not here because it comes later, after sysvr4. 
- -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ - | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ - | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ - | -sym* | -kopensolaris* | -plan9* \ - | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ - | -aos* | -aros* | -cloudabi* | -sortix* \ - | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ - | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ - | -hiux* | -knetbsd* | -mirbsd* | -netbsd* \ - | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \ - | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ - | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ - | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ - | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* | -hcos* \ - | -chorusos* | -chorusrdb* | -cegcc* | -glidix* \ - | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ - | -midipix* | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ - | -linux-newlib* | -linux-musl* | -linux-uclibc* \ - | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \ - | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* \ - | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ - | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ - | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ - | -morphos* | -superux* | -rtmk* | -windiss* \ - | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ - | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \ - | -onefs* | -tirtos* | -phoenix* | -fuchsia* | -redox* | -bme* \ - | -midnightbsd*) - # Remember, each alternative MUST END IN *, to match a version number. 
- ;; - -qnx*) - case $basic_machine in - x86-* | i*86-*) + # Some version numbers need modification + chorusos*) + os=chorusos + ;; + isc) + os=isc2.2 + ;; + sco6) + os=sco5v6 + ;; + sco5) + os=sco3.2v5 + ;; + sco4) + os=sco3.2v4 + ;; + sco3.2.[4-9]*) + os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` + ;; + sco*v* | scout) + # Don't match below + ;; + sco*) + os=sco3.2v2 + ;; + psos*) + os=psos + ;; + qnx*) + case $cpu in + x86 | i*86) ;; *) - os=-nto$os + os=nto-$os ;; esac ;; - -nto-qnx*) + hiux*) + os=hiuxwe2 + ;; + lynx*178) + os=lynxos178 ;; - -nto*) - os=`echo $os | sed -e 's|nto|nto-qnx|'` + lynx*5) + os=lynxos5 ;; - -sim | -xray | -os68k* | -v88r* \ - | -windows* | -osx | -abug | -netware* | -os9* \ - | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) + lynxos*) + # don't get caught up in next wildcard ;; - -mac*) + lynx*) + os=lynxos + ;; + mac[0-9]*) os=`echo "$os" | sed -e 's|mac|macos|'` ;; - -linux-dietlibc) - os=-linux-dietlibc + opened*) + os=openedition ;; - -linux*) - os=`echo $os | sed -e 's|linux|linux-gnu|'` + os400*) + os=os400 ;; - -sunos5*) + sunos5*) os=`echo "$os" | sed -e 's|sunos5|solaris2|'` ;; - -sunos6*) + sunos6*) os=`echo "$os" | sed -e 's|sunos6|solaris3|'` ;; - -opened*) - os=-openedition - ;; - -os400*) - os=-os400 - ;; - -wince*) - os=-wince + wince*) + os=wince ;; - -utek*) - os=-bsd + utek*) + os=bsd ;; - -dynix*) - os=-bsd + dynix*) + os=bsd ;; - -acis*) - os=-aos + acis*) + os=aos ;; - -atheos*) - os=-atheos + atheos*) + os=atheos ;; - -syllable*) - os=-syllable + syllable*) + os=syllable ;; - -386bsd) - os=-bsd - ;; - -ctix* | -uts*) - os=-sysv + 386bsd) + os=bsd ;; - -nova*) - os=-rtmk-nova + ctix* | uts*) + os=sysv ;; - -ns2) - os=-nextstep2 + nova*) + os=rtmk-nova ;; - -nsk*) - os=-nsk + ns2) + os=nextstep2 ;; # Preserve the version number of sinix5. 
- -sinix5.*) + sinix5.*) os=`echo $os | sed -e 's|sinix|sysv|'` ;; - -sinix*) - os=-sysv4 - ;; - -tpf*) - os=-tpf - ;; - -triton*) - os=-sysv3 + sinix*) + os=sysv4 ;; - -oss*) - os=-sysv3 + tpf*) + os=tpf ;; - -svr4*) - os=-sysv4 + triton*) + os=sysv3 ;; - -svr3) - os=-sysv3 + oss*) + os=sysv3 ;; - -sysvr4) - os=-sysv4 + svr4*) + os=sysv4 ;; - # This must come after -sysvr4. - -sysv*) + svr3) + os=sysv3 ;; - -ose*) - os=-ose + sysvr4) + os=sysv4 ;; - -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) - os=-mint + ose*) + os=ose ;; - -zvmoe) - os=-zvmoe + *mint | mint[0-9]* | *MiNT | MiNT[0-9]*) + os=mint ;; - -dicos*) - os=-dicos + dicos*) + os=dicos ;; - -pikeos*) + pikeos*) # Until real need of OS specific support for # particular features comes up, bare metal # configurations are quite functional. - case $basic_machine in + case $cpu in arm*) - os=-eabi + os=eabi ;; *) - os=-elf + os=elf ;; esac ;; - -nacl*) - ;; - -ios) - ;; - -none) - ;; *) - # Get rid of the `-' at the beginning of $os. - os=`echo $os | sed 's/[^-]*-//'` - echo Invalid configuration \`"$1"\': system \`"$os"\' not recognized 1>&2 - exit 1 + # No normalization, but not necessarily accepted, that comes below. ;; esac + else # Here we handle the default operating systems that come with various machines. @@ -1543,254 +1499,348 @@ else # will signal an error saying that MANUFACTURER isn't an operating # system, and we'll never get to this point. -case $basic_machine in +kernel= +case $cpu-$vendor in score-*) - os=-elf + os=elf ;; spu-*) - os=-elf + os=elf ;; *-acorn) - os=-riscix1.2 + os=riscix1.2 ;; arm*-rebel) - os=-linux + kernel=linux + os=gnu ;; arm*-semi) - os=-aout + os=aout ;; c4x-* | tic4x-*) - os=-coff + os=coff ;; c8051-*) - os=-elf + os=elf + ;; + clipper-intergraph) + os=clix ;; hexagon-*) - os=-elf + os=elf ;; tic54x-*) - os=-coff + os=coff ;; tic55x-*) - os=-coff + os=coff ;; tic6x-*) - os=-coff + os=coff ;; # This must come before the *-dec entry. 
pdp10-*) - os=-tops20 + os=tops20 ;; pdp11-*) - os=-none + os=none ;; *-dec | vax-*) - os=-ultrix4.2 + os=ultrix4.2 ;; m68*-apollo) - os=-domain + os=domain ;; i386-sun) - os=-sunos4.0.2 + os=sunos4.0.2 ;; m68000-sun) - os=-sunos3 + os=sunos3 ;; m68*-cisco) - os=-aout + os=aout ;; mep-*) - os=-elf + os=elf ;; mips*-cisco) - os=-elf + os=elf ;; mips*-*) - os=-elf + os=elf ;; or32-*) - os=-coff + os=coff ;; *-tti) # must be before sparc entry or we get the wrong os. - os=-sysv3 + os=sysv3 ;; sparc-* | *-sun) - os=-sunos4.1.1 + os=sunos4.1.1 ;; pru-*) - os=-elf + os=elf ;; *-be) - os=-beos + os=beos ;; *-ibm) - os=-aix + os=aix ;; *-knuth) - os=-mmixware + os=mmixware ;; *-wec) - os=-proelf + os=proelf ;; *-winbond) - os=-proelf + os=proelf ;; *-oki) - os=-proelf + os=proelf ;; *-hp) - os=-hpux + os=hpux ;; *-hitachi) - os=-hiux + os=hiux ;; i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) - os=-sysv + os=sysv ;; *-cbm) - os=-amigaos + os=amigaos ;; *-dg) - os=-dgux + os=dgux ;; *-dolphin) - os=-sysv3 + os=sysv3 ;; m68k-ccur) - os=-rtu + os=rtu ;; m88k-omron*) - os=-luna + os=luna ;; *-next) - os=-nextstep + os=nextstep ;; *-sequent) - os=-ptx + os=ptx ;; *-crds) - os=-unos + os=unos ;; *-ns) - os=-genix + os=genix ;; i370-*) - os=-mvs + os=mvs ;; *-gould) - os=-sysv + os=sysv ;; *-highlevel) - os=-bsd + os=bsd ;; *-encore) - os=-bsd + os=bsd ;; *-sgi) - os=-irix + os=irix ;; *-siemens) - os=-sysv4 + os=sysv4 ;; *-masscomp) - os=-rtu + os=rtu ;; f30[01]-fujitsu | f700-fujitsu) - os=-uxpv + os=uxpv ;; *-rom68k) - os=-coff + os=coff ;; *-*bug) - os=-coff + os=coff ;; *-apple) - os=-macos + os=macos ;; *-atari*) - os=-mint + os=mint + ;; + *-wrs) + os=vxworks ;; *) - os=-none + os=none ;; esac + fi +# Now, validate our (potentially fixed-up) OS. +case $os in + # Sometimes we do "kernel-abi", so those need to count as OSes. 
+ musl* | newlib* | uclibc*) + ;; + # Likewise for "kernel-libc" + eabi | eabihf | gnueabi | gnueabihf) + ;; + # Now accept the basic system types. + # The portable systems comes first. + # Each alternative MUST end in a * to match a version number. + gnu* | android* | bsd* | mach* | minix* | genix* | ultrix* | irix* \ + | *vms* | esix* | aix* | cnk* | sunos | sunos[34]* \ + | hpux* | unos* | osf* | luna* | dgux* | auroraux* | solaris* \ + | sym* | plan9* | psp* | sim* | xray* | os68k* | v88r* \ + | hiux* | abug | nacl* | netware* | windows* \ + | os9* | macos* | osx* | ios* \ + | mpw* | magic* | mmixware* | mon960* | lnews* \ + | amigaos* | amigados* | msdos* | newsos* | unicos* | aof* \ + | aos* | aros* | cloudabi* | sortix* | twizzler* \ + | nindy* | vxsim* | vxworks* | ebmon* | hms* | mvs* \ + | clix* | riscos* | uniplus* | iris* | isc* | rtu* | xenix* \ + | mirbsd* | netbsd* | dicos* | openedition* | ose* \ + | bitrig* | openbsd* | solidbsd* | libertybsd* | os108* \ + | ekkobsd* | freebsd* | riscix* | lynxos* | os400* \ + | bosx* | nextstep* | cxux* | aout* | elf* | oabi* \ + | ptx* | coff* | ecoff* | winnt* | domain* | vsta* \ + | udi* | lites* | ieee* | go32* | aux* | hcos* \ + | chorusrdb* | cegcc* | glidix* \ + | cygwin* | msys* | pe* | moss* | proelf* | rtems* \ + | midipix* | mingw32* | mingw64* | mint* \ + | uxpv* | beos* | mpeix* | udk* | moxiebox* \ + | interix* | uwin* | mks* | rhapsody* | darwin* \ + | openstep* | oskit* | conix* | pw32* | nonstopux* \ + | storm-chaos* | tops10* | tenex* | tops20* | its* \ + | os2* | vos* | palmos* | uclinux* | nucleus* | morphos* \ + | scout* | superux* | sysv* | rtmk* | tpf* | windiss* \ + | powermax* | dnix* | nx6 | nx7 | sei* | dragonfly* \ + | skyos* | haiku* | rdos* | toppers* | drops* | es* \ + | onefs* | tirtos* | phoenix* | fuchsia* | redox* | bme* \ + | midnightbsd* | amdhsa* | unleashed* | emscripten* | wasi* \ + | nsk* | powerunix* | genode* | zvmoe* ) + ;; + # This one is extra strict with allowed 
versions + sco3.2v2 | sco3.2v[4-9]* | sco5v6*) + # Don't forget version if it is 3.2v4 or newer. + ;; + none) + ;; + *) + echo Invalid configuration \`"$1"\': OS \`"$os"\' not recognized 1>&2 + exit 1 + ;; +esac + +# As a final step for OS-related things, validate the OS-kernel combination +# (given a valid OS), if there is a kernel. +case $kernel-$os in + linux-gnu* | linux-dietlibc* | linux-android* | linux-newlib* | linux-musl* | linux-uclibc* ) + ;; + -dietlibc* | -newlib* | -musl* | -uclibc* ) + # These are just libc implementations, not actual OSes, and thus + # require a kernel. + echo "Invalid configuration \`$1': libc \`$os' needs explicit kernel." 1>&2 + exit 1 + ;; + kfreebsd*-gnu* | kopensolaris*-gnu*) + ;; + nto-qnx*) + ;; + *-eabi* | *-gnueabi*) + ;; + -*) + # Blank kernel with real OS is always fine. + ;; + *-*) + echo "Invalid configuration \`$1': Kernel \`$kernel' not known to work with OS \`$os'." 1>&2 + exit 1 + ;; +esac + # Here we handle the case where we know the os, and the CPU type, but not the # manufacturer. We pick the logical manufacturer. 
-vendor=unknown -case $basic_machine in - *-unknown) - case $os in - -riscix*) +case $vendor in + unknown) + case $cpu-$os in + *-riscix*) vendor=acorn ;; - -sunos*) + *-sunos*) vendor=sun ;; - -cnk*|-aix*) + *-cnk* | *-aix*) vendor=ibm ;; - -beos*) + *-beos*) vendor=be ;; - -hpux*) + *-hpux*) vendor=hp ;; - -mpeix*) + *-mpeix*) vendor=hp ;; - -hiux*) + *-hiux*) vendor=hitachi ;; - -unos*) + *-unos*) vendor=crds ;; - -dgux*) + *-dgux*) vendor=dg ;; - -luna*) + *-luna*) vendor=omron ;; - -genix*) + *-genix*) vendor=ns ;; - -mvs* | -opened*) + *-clix*) + vendor=intergraph + ;; + *-mvs* | *-opened*) + vendor=ibm + ;; + *-os400*) vendor=ibm ;; - -os400*) + s390-* | s390x-*) vendor=ibm ;; - -ptx*) + *-ptx*) vendor=sequent ;; - -tpf*) + *-tpf*) vendor=ibm ;; - -vxsim* | -vxworks* | -windiss*) + *-vxsim* | *-vxworks* | *-windiss*) vendor=wrs ;; - -aux*) + *-aux*) vendor=apple ;; - -hms*) + *-hms*) vendor=hitachi ;; - -mpw* | -macos*) + *-mpw* | *-macos*) vendor=apple ;; - -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + *-*mint | *-mint[0-9]* | *-*MiNT | *-MiNT[0-9]*) vendor=atari ;; - -vos*) + *-vos*) vendor=stratus ;; esac - basic_machine=`echo "$basic_machine" | sed "s/unknown/$vendor/"` ;; esac -echo "$basic_machine$os" +echo "$cpu-$vendor-${kernel:+$kernel-}$os" exit # Local variables: diff --git a/configure.ac b/configure.ac index 2efaf19..1afa37e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ dnl Process this file with autoconf to produce a configure script. -AC_INIT(onig, 6.9.6) +AC_INIT(onig, 6.9.7) AC_CONFIG_MACRO_DIR([m4]) @@ -57,7 +57,7 @@ fi dnl Checks for programs. AC_PROG_CC LT_INIT -LTVERSION="6:0:1" +LTVERSION="7:0:2" AC_SUBST(LTVERSION) AC_PROG_INSTALL @@ -3,7 +3,7 @@ scriptversion=2018-03-07.03; # UTC -# Copyright (C) 1999-2018 Free Software Foundation, Inc. +# Copyright (C) 1999-2020 Free Software Foundation, Inc. 
# This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -1,4 +1,4 @@ -Oniguruma API Version 6.9.6 2020/07/12 +Oniguruma API Version 6.9.7 2021/03/03 #include <oniguruma.h> @@ -15,6 +15,10 @@ Oniguruma API Version 6.9.6 2020/07/12 1 use_encodings: array of encodings used in application. 2 num_encodings: number of encodings. + return value + normal: ONIG_NORMAL == 0 + error: error code < 0 + # int onig_error_code_to_str(UChar* err_buf, int err_code, ...) @@ -22,7 +26,8 @@ Oniguruma API Version 6.9.6 2020/07/12 If this function is used for onig_new(), don't call this after the pattern argument of onig_new() is freed. - normal return: error message string length + return value + normal: error message string length arguments 1 err_buf: error message string buffer. @@ -60,7 +65,9 @@ Oniguruma API Version 6.9.6 2020/07/12 Create a regex object. - normal return: ONIG_NORMAL + return value + normal: ONIG_NORMAL == 0 + error: error code < 0 arguments 1 reg: return regex object's address. @@ -75,14 +82,12 @@ Oniguruma API Version 6.9.6 2020/07/12 ONIG_OPTION_EXTEND extended pattern form ONIG_OPTION_FIND_LONGEST find longest match ONIG_OPTION_FIND_NOT_EMPTY ignore empty match - ONIG_OPTION_NEGATE_SINGLELINE - clear ONIG_OPTION_SINGLELINE which is enabled on - ONIG_SYNTAX_POSIX_BASIC, ONIG_SYNTAX_POSIX_EXTENDED, - ONIG_SYNTAX_PERL, ONIG_SYNTAX_PERL_NG, ONIG_SYNTAX_JAVA + ONIG_OPTION_NEGATE_SINGLELINE clear ONIG_OPTION_SINGLELINE which is enabled on ONIG_SYNTAX_POSIX_BASIC/POSIX_EXTENDED/PERL/PERL_NG/PYTHON/JAVA ONIG_OPTION_DONT_CAPTURE_GROUP only named group captured. ONIG_OPTION_CAPTURE_GROUP named and no-named group captured. 
+ ONIG_OPTION_IGNORECASE_IS_ASCII Limit IGNORECASE((?i)) to a range of ASCII characters ONIG_OPTION_WORD_IS_ASCII ASCII only word (\w, \p{Word}, [[:word:]]) ASCII only word bound (\b) ONIG_OPTION_DIGIT_IS_ASCII ASCII only digit (\d, \p{Digit}, [[:digit:]]) @@ -145,6 +150,7 @@ Oniguruma API Version 6.9.6 2020/07/12 ONIG_SYNTAX_JAVA Java (Sun java.util.regex) ONIG_SYNTAX_PERL Perl ONIG_SYNTAX_PERL_NG Perl + named group + ONIG_SYNTAX_PYTHON Python ONIG_SYNTAX_ONIGURUMA Oniguruma ONIG_SYNTAX_DEFAULT default (== ONIG_SYNTAX_ONIGURUMA) onig_set_default_syntax() @@ -164,8 +170,9 @@ Oniguruma API Version 6.9.6 2020/07/12 Create a regex object. reg object area is not allocated in this function. - normal return: ONIG_NORMAL - + return value + normal: ONIG_NORMAL == 0 + error: error code < 0 # int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, @@ -177,7 +184,9 @@ Oniguruma API Version 6.9.6 2020/07/12 Create a regex object. This function is deluxe version of onig_new(). - normal return: ONIG_NORMAL + return value + normal: ONIG_NORMAL == 0 + error: error code < 0 arguments 1 reg: return address of regex object. @@ -319,9 +328,13 @@ Oniguruma API Version 6.9.6 2020/07/12 Search string and return search result and matching region. Do not pass invalid byte string in the regex character encoding. - normal return: match position offset (i.e. p - str >= 0) - not found: ONIG_MISMATCH (< 0) - error: error code (< 0) + return value + normal: match position offset (i.e. p - str >= 0) + not found: ONIG_MISMATCH (< 0) + error: error code (< 0) + + * If option ONIG_OPTION_CALLBACK_EACH_MATCH is used, + it will return ONIG_MISMATCH even if there is a match. 
arguments 1 reg: regex object @@ -334,11 +347,24 @@ Oniguruma API Version 6.9.6 2020/07/12 6 region: address for return group match range info (NULL is allowed) 7 option: search time option - ONIG_OPTION_NOTBOL (str) isn't considered as begin of line and begin of string (* ONIG_OPTION_NOT_BEGIN_STRING) - ONIG_OPTION_NOTEOL (end) isn't considered as end of line and end of string (* ONIG_OPTION_NOT_END_STRING) - ONIG_OPTION_NOT_BEGIN_STRING (str) isn't considered as begin of string (* fail \A) - ONIG_OPTION_NOT_END_STRING (end) isn't considered as end of string (* fail \z, \Z) - ONIG_OPTION_NOT_BEGIN_POSITION (start) isn't considered as start position of search (* fail \G) + ONIG_OPTION_NOTBOL Do not regard the beginning of the (str) as the beginning of the line and the beginning of the string + ONIG_OPTION_NOTEOL Do not regard the (end) as the end of a line and the end of a string + ONIG_OPTION_NOT_BEGIN_STRING Do not regard the beginning of the (str) as the beginning of a string (* fail \A) + ONIG_OPTION_NOT_END_STRING Do not regard the (end) as a string endpoint (* fail \z, \Z) + ONIG_OPTION_NOT_BEGIN_POSITION Do not regard the (start) as start position of search (* fail \G) + + ONIG_OPTION_CALLBACK_EACH_MATCH + Call back for all successful matches. + (including the case of the same matching start position) + The search does not stop when a match is found at a certain position. + The callback function to be called is set by + onig_set_callback_each_match(). + The user_data in the argument passed to the callback function is + specified by onig_set_callout_user_data_of_match_param(mp, user_data). + Therefore, if you want to specify user_data, + use onig_search_with_param() instead of onig_search(). + The user_data specified by onig_set_callout_user_data_of_match_param() + will be shared with callout. 
# int onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, @@ -359,9 +385,13 @@ Oniguruma API Version 6.9.6 2020/07/12 Match string and return result and matching region. Do not pass invalid byte string in the regex character encoding. - normal return: match length (>= 0) - not match: ONIG_MISMATCH (< 0) - error: error code (< 0) + return value + normal: match length (>= 0) + not match: ONIG_MISMATCH (< 0) + error: error code (< 0) + + * If option ONIG_OPTION_CALLBACK_EACH_MATCH is used, + it will return ONIG_MISMATCH even if there is a match. arguments 1 reg: regex object @@ -371,11 +401,12 @@ Oniguruma API Version 6.9.6 2020/07/12 5 region: address for return group match range info (NULL is allowed) 6 option: search time option - ONIG_OPTION_NOTBOL (str) isn't considered as begin of line and begin of string (* ONIG_OPTION_NOT_BEGIN_STRING) - ONIG_OPTION_NOTEOL (end) isn't considered as end of line and end of string (* ONIG_OPTION_NOT_END_STRING) - ONIG_OPTION_NOT_BEGIN_STRING (str) isn't considered as begin of string (* fail \A) - ONIG_OPTION_NOT_END_STRING (end) isn't considered as end of string (* fail \z, \Z) - ONIG_OPTION_NOT_BEGIN_POSITION (at) isn't considered as start position of search (* fail \G) + ONIG_OPTION_NOTBOL Do not regard the beginning of the (str) as the beginning of the line and the beginning of the string + ONIG_OPTION_NOTEOL Do not regard the (end) as the end of a line and the end of a string + ONIG_OPTION_NOT_BEGIN_STRING Do not regard the beginning of the (str) as the beginning of a string (* fail \A) + ONIG_OPTION_NOT_END_STRING Do not regard the (end) as a string endpoint (* fail \z, \Z) + ONIG_OPTION_NOT_BEGIN_POSITION Do not regard the (start) as start position of search (* fail \G) + ONIG_OPTION_CALLBACK_EACH_MATCH Call back for all successful matches. 
# int onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, @@ -398,9 +429,10 @@ Oniguruma API Version 6.9.6 2020/07/12 Scan string and callback with matching region. Do not pass invalid byte string in the regex character encoding. - normal return: number of matching times - error: error code - interruption: return value of callback function (!= 0) + return value + normal: number of matching times + error: error code + interruption: return value of callback function (!= 0) arguments 1 reg: regex object @@ -423,7 +455,9 @@ Oniguruma API Version 6.9.6 2020/07/12 2 n: number of regex in regs 3 regs: array of regex - normal return: ONIG_NORMAL + return value + normal: ONIG_NORMAL == 0 + error: error code < 0 # int onig_regset_add(OnigRegSet* set, regex_t* reg) @@ -436,7 +470,9 @@ Oniguruma API Version 6.9.6 2020/07/12 1 set: regset object 2 reg: regex object - normal return: ONIG_NORMAL + return value + normal: ONIG_NORMAL == 0 + error: error code < 0 # int onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) @@ -449,7 +485,9 @@ Oniguruma API Version 6.9.6 2020/07/12 2 at: index of regex (zero origin) 3 reg: regex object - normal return: ONIG_NORMAL + return value + normal: ONIG_NORMAL == 0 + error: error code < 0 # void onig_regset_free(OnigRegSet* set) @@ -492,9 +530,9 @@ Oniguruma API Version 6.9.6 2020/07/12 Perform a search with regset. 
return value: - normal return: index of match regex (zero origin) - not found: ONIG_MISMATCH (< 0) - error: error code (< 0) + normal: index of match regex (zero origin) + not found: ONIG_MISMATCH (< 0) + error: error code (< 0) arguments 1 set: regset object @@ -507,11 +545,11 @@ Oniguruma API Version 6.9.6 2020/07/12 ONIG_REGSET_REGEX_LEAD (returns most left position) ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (returns first match regex) 7 option: search time option - ONIG_OPTION_NOTBOL (str) isn't considered as begin of line and begin of string (* ONIG_OPTION_NOT_BEGIN_STRING) - ONIG_OPTION_NOTEOL end (end) isn't considered as end of line and end of string (* ONIG_OPTION_NOT_END_STRING) - ONIG_OPTION_NOT_BEGIN_STRING (str) isn't considered as begin of string (* fail \A) - ONIG_OPTION_NOT_END_STRING (end) isn't considered as end of string (* fail \z, \Z) - ONIG_OPTION_NOT_BEGIN_POSITION (start) isn't considered as start position of search (* fail \G) + ONIG_OPTION_NOTBOL Do not regard the beginning of the (str) as the beginning of the line and the beginning of the string + ONIG_OPTION_NOTEOL Do not regard the (end) as the end of a line and the end of a string + ONIG_OPTION_NOT_BEGIN_STRING Do not regard the beginning of the (str) as the beginning of a string (* fail \A) + ONIG_OPTION_NOT_END_STRING Do not regard the (end) as a string endpoint (* fail \z, \Z) + ONIG_OPTION_NOT_BEGIN_POSITION Do not regard the (start) as start position of search (* fail \G) 8 rmatch_pos: return address of match position (match_address - str) @@ -525,9 +563,9 @@ Oniguruma API Version 6.9.6 2020/07/12 Perform a search with regset and match-params. 
return value: - normal return: index of match regex (zero origin) - not found: ONIG_MISMATCH (< 0) - error: error code (< 0) + normal: index of match regex (zero origin) + not found: ONIG_MISMATCH (< 0) + error: error code (< 0) arguments 1 set: regset object @@ -540,11 +578,11 @@ Oniguruma API Version 6.9.6 2020/07/12 ONIG_REGSET_REGEX_LEAD (returns most left position) ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (returns first match regex) 7 option: search time option - ONIG_OPTION_NOTBOL (str) isn't considered as begin of line and begin of string (* ONIG_OPTION_NOT_BEGIN_STRING) - ONIG_OPTION_NOTEOL (end) isn't considered as end of line and end of string (* ONIG_OPTION_NOT_END_STRING) - ONIG_OPTION_NOT_BEGIN_STRING (str) isn't considered as begin of string (* fail \A) - ONIG_OPTION_NOT_END_STRING (end) isn't considered as end of string (* fail \z, \Z) - ONIG_OPTION_NOT_BEGIN_POSITION (start) isn't considered as start position of search (* fail \G) + ONIG_OPTION_NOTBOL Do not regard the beginning of the (str) as the beginning of the line and the beginning of the string + ONIG_OPTION_NOTEOL Do not regard the (end) as the end of a line and the end of a string + ONIG_OPTION_NOT_BEGIN_STRING Do not regard the beginning of the (str) as the beginning of a string (* fail \A) + ONIG_OPTION_NOT_END_STRING Do not regard the (end) as a string endpoint (* fail \z, \Z) + ONIG_OPTION_NOT_BEGIN_POSITION Do not regard the (start) as start position of search (* fail \G) 8 mps: array of match-params 9 rmatch_pos: return address of match position (match_address - str) @@ -585,7 +623,9 @@ Oniguruma API Version 6.9.6 2020/07/12 Resize group range area of region. - normal return: ONIG_NORMAL + return value + normal: ONIG_NORMAL == 0 + error: error code < 0 arguments 1 region: target region @@ -598,8 +638,9 @@ Oniguruma API Version 6.9.6 2020/07/12 Return the group number list of the name. Named subexp is defined by (?<name>....). - normal return: number of groups for the name. - (ex. 
/(?<x>..)(?<x>..)/ ==> 2) + return value + normal: number of groups for the name. + (ex. /(?<x>..)(?<x>..)/ ==> 2) name not found: -1 arguments @@ -616,7 +657,9 @@ Oniguruma API Version 6.9.6 2020/07/12 If two or more regions for the groups of the name are effective, the greatest number in it is obtained. - normal return: group number. + return value + normal: group number + error: error code < 0 arguments 1 reg: regex object. @@ -631,8 +674,9 @@ Oniguruma API Version 6.9.6 2020/07/12 Iterate function call for all names. - normal return: 0 - error: func's return value. + return value + normal: 0 + error: return value of callback function arguments 1 reg: regex object. @@ -654,7 +698,6 @@ Oniguruma API Version 6.9.6 2020/07/12 # OnigEncoding onig_get_encoding(regex_t* reg) # OnigOptionType onig_get_options(regex_t* reg) -# OnigCaseFoldType onig_get_case_fold_flag(regex_t* reg) # OnigSyntaxType* onig_get_syntax(regex_t* reg) Return a value of the regex object. @@ -663,6 +706,15 @@ Oniguruma API Version 6.9.6 2020/07/12 1 reg: regex object. +# OnigCaseFoldType onig_get_case_fold_flag(regex_t* reg) + + Return the case_fold_flag of the regex object. + This function is deprecated. + + arguments + 1 reg: regex object. + + # int onig_number_of_captures(regex_t* reg) Return the number of capture group in the pattern. @@ -671,6 +723,23 @@ Oniguruma API Version 6.9.6 2020/07/12 1 reg: regex object. +# OnigCallbackEachMatchFunc onig_get_callback_each_match(void) + + Return the current callback function for ONIG_OPTION_CALLBACK_EACH_MATCH. + + +# int onig_set_callback_each_match(OnigCallbackEachMatchFunc func) + + Set the callback function for ONIG_OPTION_CALLBACK_EACH_MATCH. + If NULL is set, the callback will never be executed. + + return value + normal: 0 + + arguments + 1 func: callback function + + # int onig_number_of_capture_histories(regex_t* reg) Return the number of capture history defined in the pattern. 
@@ -682,7 +751,6 @@ Oniguruma API Version 6.9.6 2020/07/12 1 reg: regex object. - # OnigCaptureTreeNode* onig_get_capture_tree(OnigRegion* region) Return the root node of capture history data tree. @@ -698,8 +766,9 @@ Oniguruma API Version 6.9.6 2020/07/12 Traverse and callback in capture history data tree. - normal return: 0 - error: callback func's return value. + return value + normal: 0 + error: return value of callback function arguments 1 region: match region data. @@ -732,6 +801,7 @@ Oniguruma API Version 6.9.6 2020/07/12 Return noname group capture activity. + return value active: 1 inactive: 0 @@ -868,11 +938,13 @@ Oniguruma API Version 6.9.6 2020/07/12 # OnigCaseFoldType onig_get_default_case_fold_flag() Get default case fold flag. + This function is deprecated. # int onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag) Set default case fold flag. + This function is deprecated. 1 case_fold_flag: case fold flag @@ -993,7 +1065,9 @@ Oniguruma API Version 6.9.6 2020/07/12 * Don't destroy the ranges after having called this function. - normal return: ONIG_NORMAL + return value + normal: ONIG_NORMAL == 0 + error: error code < 0 # unsigned int onig_get_parse_depth_limit(void) @@ -1,4 +1,4 @@ -鬼車インターフェース Version 6.9.6 2020/07/12 +鬼車インターフェース Version 6.9.7 2021/03/16 #include <oniguruma.h> @@ -14,6 +14,10 @@ 1 use_encodings: 使用する文字エンコーディングの配列 2 num_encodings: 文字エンコーディングの数 + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: エラーコード < 0 + # int onig_error_code_to_str(UChar* err_buf, int err_code, ...) 
@@ -22,7 +26,8 @@ この関数を、onig_new()の結果に対して呼び出す場合には、onig_new()のpattern引数を メモリ解放するよりも前に呼び出さなければならない。 - 正常終了戻り値: エラーメッセージ文字列のバイト長 + 戻り値 + 正常終了: エラーメッセージ文字列のバイト長 引数 1 err_buf: エラーメッセージを格納する領域 @@ -60,7 +65,9 @@ 正規表現オブジェクト(regex)を作成する。 - 正常終了戻り値: ONIG_NORMAL + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: エラーコード < 0 引数 1 reg: 作成された正規表現オブジェクトを返すアドレス @@ -75,13 +82,11 @@ ONIG_OPTION_EXTEND パターン拡張形式 ONIG_OPTION_FIND_LONGEST 最長マッチ ONIG_OPTION_FIND_NOT_EMPTY 空マッチを無視 - ONIG_OPTION_NEGATE_SINGLELINE - ONIG_SYNTAX_POSIX_BASIC, ONIG_SYNTAX_POSIX_EXTENDED, - ONIG_SYNTAX_PERL, ONIG_SYNTAX_PERL_NG, ONIG_SYNTAX_JAVAで - デフォルトで有効なONIG_OPTION_SINGLELINEをクリアする。 + ONIG_OPTION_NEGATE_SINGLELINE ONIG_SYNTAX_POSIX_BASIC/POSIX_EXTENDED/PERL/PERL_NG/PYTHON/JAVAでデフォルトで有効なONIG_OPTION_SINGLELINEをクリアする。 ONIG_OPTION_DONT_CAPTURE_GROUP 名前付き捕獲式集合のみ捕獲 ONIG_OPTION_CAPTURE_GROUP 名前無し捕獲式集合も捕獲 + ONIG_OPTION_IGNORECASE_IS_ASCII IGNORECASE((?i))をASCII文字の範囲に制限する ONIG_OPTION_WORD_IS_ASCII wordがASCIIのみ (\w, \p{Word}, [[:word:]]) word boundがASCIIのみ (\b) ONIG_OPTION_DIGIT_IS_ASCII digitがASCIIのみ (\d, \p{Digit}, [[:digit:]]) @@ -142,6 +147,7 @@ ONIG_SYNTAX_JAVA Java (Sun java.util.regex) ONIG_SYNTAX_PERL Perl ONIG_SYNTAX_PERL_NG Perl + 名前付き捕獲式集合 + ONIG_SYNTAX_PYTHON Python ONIG_SYNTAX_ONIGURUMA Oniguruma ONIG_SYNTAX_DEFAULT default (== ONIG_SYNTAX_ONIGURUMA) onig_set_default_syntax() @@ -161,20 +167,24 @@ 正規表現オブジェクト(regex)を作成する。 regの領域を内部で割り当てない。 - 正常終了戻り値: ONIG_NORMAL + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: エラーコード < 0 # int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo) - この関数は廃止予定。 + この関数は廃止(使用不可)。 パターンと対象文字列の文字エンコーディングが異なる場合を許さなくなった。 正規表現オブジェクト(regex)を作成する。 この関数は、onig_new()のデラックス版。 - 正常終了戻り値: ONIG_NORMAL + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: エラーコード < 0 引数 1 reg: 作成された正規表現オブジェクトを返すアドレス @@ -316,8 +326,11 @@ 正規表現で文字列を検索し、検索結果とマッチ領域を返す。 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 - 正常終了戻り値: マッチ位置 (p - str >= 0) - 検索失敗: ONIG_MISMATCH (< 0) + 戻り値 + 
正常終了: マッチ位置 (p - str >= 0) + 検索失敗: ONIG_MISMATCH (< 0) + + * 若しONIG_OPTION_CALLBACK_EACH_MATCHが使用されると、マッチするものがあってもONIG_MISMATCHが返される。 引数 1 reg: 正規表現オブジェクト @@ -330,12 +343,24 @@ 6 region: マッチ領域情報(region) (NULLも許される) 7 option: 検索時オプション - ONIG_OPTION_NOTBOL strを行頭および文字列先頭と看做さない + ONIG_OPTION_NOTBOL strの先頭を行頭および文字列先頭と看做さない ONIG_OPTION_NOTEOL endを行末および文字列終端と看做さない - ONIG_OPTION_NOT_BEGIN_STRING strを文字列の先頭と看做さない (\A 失敗) - ONIG_OPTION_NOT_END_STRING end文字列終端と看做さない (\z, \Z 失敗) + ONIG_OPTION_NOT_BEGIN_STRING strの先頭を文字列の先頭と看做さない (\A 失敗) + ONIG_OPTION_NOT_END_STRING endを文字列終端と看做さない (\z, \Z 失敗) ONIG_OPTION_NOT_BEGIN_POSITION startを検索開始位置と看做さない (\G 失敗) + ONIG_OPTION_CALLBACK_EACH_MATCH + 全てのマッチ成功に対してコールバック関数が呼び出される。 + (マッチ開始位置が同じものも含めて) + ある位置でマッチするものが見つかっても探索が止まることはない。 + 呼び出されるコールバック関数は、onig_set_callback_each_match()で与える。 + コールバック関数に渡される引数の中のuser_dataは、 + onig_set_callout_user_data_of_match_param(mp, user_data)で指定する。 + このため、user_dataを指定したい場合には、onig_search()ではなく、 + onig_search_with_param()を使用することになる。 + onig_set_callout_user_data_of_match_param()で指定するuser_dataは、 + calloutで使用されるuser_dataと共用される。 + # int onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, const UChar* start, const UChar* range, OnigRegion* region, @@ -355,8 +380,11 @@ 文字列の指定位置でマッチングを行い、結果とマッチ領域を返す。 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 - 正常終了戻り値: マッチしたバイト長 (>= 0) - not match: ONIG_MISMATCH ( < 0) + 戻り値 + 正常終了: マッチしたバイト長 (>= 0) + not match: ONIG_MISMATCH ( < 0) + + * 若しONIG_OPTION_CALLBACK_EACH_MATCHが使用されると、マッチするものがあってもONIG_MISMATCHが返される。 引数 1 reg: 正規表現オブジェクト @@ -366,11 +394,12 @@ 5 region: マッチ領域情報(region) (NULLも許される) 6 option: 検索時オプション - ONIG_OPTION_NOTBOL strを行頭および文字列先頭と看做さない + ONIG_OPTION_NOTBOL strの先頭を行頭および文字列先頭と看做さない ONIG_OPTION_NOTEOL endを行末および文字列終端と看做さない - ONIG_OPTION_NOT_BEGIN_STRING strを文字列の先頭と看做さない (\A 失敗) - ONIG_OPTION_NOT_END_STRING end文字列終端と看做さない (\z, \Z 失敗) - ONIG_OPTION_NOT_BEGIN_POSITION atを検索開始位置と看做さない (\G 失敗) + ONIG_OPTION_NOT_BEGIN_STRING strの先頭を文字列の先頭と看做さない (\A 失敗) + 
ONIG_OPTION_NOT_END_STRING endを文字列終端と看做さない (\z, \Z 失敗) + ONIG_OPTION_NOT_BEGIN_POSITION startを検索開始位置と看做さない (\G 失敗) + ONIG_OPTION_CALLBACK_EACH_MATCH 全てのマッチ成功に対してコールバック関数が呼び出される。 # int onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, @@ -393,6 +422,7 @@ 正規表現で文字列をスキャンして、マッチングする毎にコールバック関数を呼び出す。 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 + 戻り値 正常終了: マッチ回数 (0回も含める) エラー: エラーコード (< 0) 中断: コールバック関数が0以外の戻り値を返したとき、その値を戻り値として中断 @@ -418,7 +448,9 @@ 2 n: 正規表現の個数 3 regs: 正規表現オブジェクトの配列 - 正常終了戻り値: ONIG_NORMAL + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: エラーコード < 0 # int onig_regset_add(OnigRegSet* set, regex_t* reg) @@ -431,7 +463,9 @@ 1 set: regsetオブジェクト 2 reg: 正規表現オブジェクト - 正常終了戻り値: ONIG_NORMAL + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: エラーコード < 0 # int onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) @@ -444,7 +478,9 @@ 2 at: 変更する場所のインデックス 2 reg: 正規表現オブジェクト - 正常終了戻り値: ONIG_NORMAL + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: エラーコード < 0 # void onig_regset_free(OnigRegSet* set) @@ -486,10 +522,10 @@ regsetによる検索を実行する。 - 戻り値: - 検索成功: マッチした正規表現オブジェクトのインデックス (ゼロ開始) - 検索失敗: ONIG_MISMATCH (< 0) - エラー: エラーコード (< 0) + 戻り値 + 検索成功: マッチした正規表現オブジェクトのインデックス (ゼロ開始) + 検索失敗: ONIG_MISMATCH (< 0) + エラー: エラーコード (< 0) 引数 1 set: regsetオブジェクト @@ -503,10 +539,10 @@ ONIG_REGSET_REGEX_LEAD (最左位置でマッチした結果を返す) ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (最初にマッチした正規表現の結果を返す) 7 option: 検索時オプション - ONIG_OPTION_NOTBOL strを行頭および文字列先頭と看做さない + ONIG_OPTION_NOTBOL strの先頭を行頭および文字列先頭と看做さない ONIG_OPTION_NOTEOL endを行末および文字列終端と看做さない - ONIG_OPTION_NOT_BEGIN_STRING strを文字列の先頭と看做さない (\A 失敗) - ONIG_OPTION_NOT_END_STRING end文字列終端と看做さない (\z, \Z 失敗) + ONIG_OPTION_NOT_BEGIN_STRING strの先頭を文字列の先頭と看做さない (\A 失敗) + ONIG_OPTION_NOT_END_STRING endを文字列終端と看做さない (\z, \Z 失敗) ONIG_OPTION_NOT_BEGIN_POSITION startを検索開始位置と看做さない (\G 失敗) 8 rmatch_pos: マッチした位置を返すためのアドレス (match_address - str) @@ -519,10 +555,10 @@ regsetとOnigMatchParamオブジェクトによる検索を実行する。 - 戻り値: - 検索成功: マッチした正規表現オブジェクトのインデックス (ゼロ開始) - 検索失敗: ONIG_MISMATCH (< 0) - エラー: エラーコード 
(< 0) + 戻り値 + 検索成功: マッチした正規表現オブジェクトのインデックス (ゼロ開始) + 検索失敗: ONIG_MISMATCH (< 0) + エラー: エラーコード (< 0) 引数 1 set: regsetオブジェクト @@ -536,10 +572,10 @@ ONIG_REGSET_REGEX_LEAD (最左位置でマッチした結果を返す) ONIG_REGSET_PRIORITY_TO_REGEX_ORDER (最初にマッチした正規表現の結果を返す) 7 option: 検索時オプション - ONIG_OPTION_NOTBOL strを行頭および文字列先頭と看做さない + ONIG_OPTION_NOTBOL strの先頭を行頭および文字列先頭と看做さない ONIG_OPTION_NOTEOL endを行末および文字列終端と看做さない - ONIG_OPTION_NOT_BEGIN_STRING strを文字列の先頭と看做さない (\A 失敗) - ONIG_OPTION_NOT_END_STRING end文字列終端と看做さない (\z, \Z 失敗) + ONIG_OPTION_NOT_BEGIN_STRING strの先頭を文字列の先頭と看做さない (\A 失敗) + ONIG_OPTION_NOT_END_STRING endを文字列終端と看做さない (\z, \Z 失敗) ONIG_OPTION_NOT_BEGIN_POSITION startを検索開始位置と看做さない (\G 失敗) 8 mps: OnigMatchParamオブジェクトの配列 9 rmatch_pos: マッチした位置を返すためのアドレス (match_address - str) @@ -580,7 +616,9 @@ マッチ領域情報(region)の捕獲式集合(グループ)数を変更する。 - 正常終了戻り値: ONIG_NORMAL + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: エラーコード < 0 引数 1 region: 対象領域 @@ -594,8 +632,9 @@ グループ番号リストを返す。 名前付き捕獲式集合は、(?<name>....)によって定義できる。 - 正常終了戻り値: 指定された名前に対するグループ数 - (例 /(?<x>..)(?<x>..)/ ==> 2) + 戻り値 + 正常終了: 指定された名前に対するグループ数 + (例 /(?<x>..)(?<x>..)/ ==> 2) 名前に対するグループが存在しない: -1 引数 @@ -613,7 +652,9 @@ 名前に対する捕獲式集合が一個しかないときには、対応するマッチ領域が有効か どうかに関係なく、その番号を返す。(従って、regionにはNULLを渡してもよい。) - 正常終了戻り値: 番号 + 戻り値 + 正常終了: 番号 + エラー時: エラーコード < 0 引数 1 reg: 正規表現オブジェクト @@ -628,8 +669,9 @@ 全ての名前に対してコールバック関数呼び出しを実行する。 - 正常終了戻り値: 0 - エラー: コールバック関数の戻り値 + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: コールバック関数の戻り値 引数 1 reg: 正規表現オブジェクト @@ -654,7 +696,6 @@ # OnigEncoding onig_get_encoding(regex_t* reg) # OnigOptionType onig_get_options(regex_t* reg) -# OnigCaseFoldType onig_get_case_fold_flag(regex_t* reg) # OnigSyntaxType* onig_get_syntax(regex_t* reg) 正規表現オブジェクトに対して、対応する値を返す。 @@ -663,6 +704,15 @@ 1 reg: 正規表現オブジェクト +# OnigCaseFoldType onig_get_case_fold_flag(regex_t* reg) + + 正規表現オブジェクトに対して、case_fold_flag値を返す。 + この関数は廃止予定(非推奨)。 + + 引数 + 1 reg: 正規表現オブジェクト + + # int onig_number_of_captures(regex_t* reg) パターン中で定義された捕獲グループの数を返す。 @@ -671,6 +721,23 @@ 1 reg: 正規表現オブジェクト +# 
OnigCallbackEachMatchFunc onig_get_callback_each_match(void) + + ONIG_OPTION_CALLBACK_EACH_MATCHに対する現在のコールバック関数を返す。 + + +# int onig_set_callback_each_match(OnigCallbackEachMatchFunc func) + + ONIG_OPTION_CALLBACK_EACH_MATCHに対するコールバック関数をセットする。 + 若しNULLがセットされると、コールバックは実行されない。 + + 戻り値 + 正常終了: ONIG_NORMAL == 0 + + 引数 + 1 func: コールバック関数 + + # int onig_number_of_capture_histories(regex_t* reg) パターン中で定義された捕獲履歴(?@...)の数を返す。 @@ -697,8 +764,9 @@ 捕獲履歴データ木を巡回してコールバックする。 - 正常終了戻り値: 0 - エラー: コールバック関数の戻り値 + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: コールバック関数の戻り値 引数 1 region: マッチ領域 @@ -733,6 +801,7 @@ 名前なし式集合の捕獲機能が有効かどうかを返す。 + 戻り値 有効: 1 無効: 0 @@ -869,11 +938,13 @@ # OnigCaseFoldType onig_get_default_case_fold_flag() デフォルトのcase foldフラグを取得する。 + この関数は廃止予定(非推奨)。 # int onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag) デフォルトのcase foldフラグをセットする。 + この関数は廃止予定(非推奨)。 引数 1 case_fold_flag: case foldフラグ @@ -996,7 +1067,9 @@ * この関数を呼んだ後で、rangesを変更/破壊しないこと - 正常終了戻り値: ONIG_NORMAL + 戻り値 + 正常終了: ONIG_NORMAL == 0 + エラー時: エラーコード < 0 # unsigned int onig_get_parse_depth_limit(void) diff --git a/doc/CALLOUTS.API b/doc/CALLOUTS.API index c4a13c8..7ecf6bc 100644 --- a/doc/CALLOUTS.API +++ b/doc/CALLOUTS.API @@ -1,4 +1,4 @@ -Callouts API Version 6.8.2 2018/06/08 +Callouts API Version 6.9.7 2021/03/21 #include <oniguruma.h> @@ -273,48 +273,51 @@ Callouts API Version 6.8.2 2018/06/08 Returns the callout data value/type for a callout slot indicated by callout_num/slot. - normal return: ONIG_NORMAL - 1: not yet set (type is ONIG_TYPE_VOID) - < 0: error code + ONIG_NORMAL: normal return + ONIG_VALUE_IS_NOT_SET: value is not set / type is ONIG_TYPE_VOID + < 0: error code # int onig_get_callout_data_by_callout_args_self(OnigCalloutArgs* args, int slot, OnigType* type, OnigValue* val) Returns self callout data value/type. 
- normal return: ONIG_NORMAL - 1: not yet set (type is ONIG_TYPE_VOID) - < 0: error code + ONIG_NORMAL: normal return + ONIG_VALUE_IS_NOT_SET: value is not set / type is ONIG_TYPE_VOID + < 0: error code # int onig_set_callout_data_by_callout_args(OnigCalloutArgs* args, int callout_num, int slot, OnigType type, OnigValue* val) Set the callout data value/type for a callout slot indicated by callout_num/slot. - normal return: ONIG_NORMAL - < 0: error code + ONIG_NORMAL: normal return + < 0: error code # int onig_set_callout_data_by_callout_args_self(OnigCalloutArgs* args, int slot, OnigType type, OnigValue* val) Set self callout data value/type for a callout slot indicated by slot. - normal return: ONIG_NORMAL - < 0: error code + ONIG_NORMAL: normal return + < 0: error code # int onig_get_callout_data_by_callout_args_self_dont_clear_old(OnigCalloutArgs* args, int slot, OnigType* type, OnigValue* val) This function is almost same as onig_get_callout_data_by_callout_args_self(). - But this function doesn't clear values which set in previous failed match process. - Other onig_get_callout_data_xxxx() functions clear all values which set - in previous failed match process. + But this function does not clear the value set in the collation position before the current position. (dont_clear_old) + The other onig_get_callout_data_xxxx() function clears the value set in the collation process of the previous position. For example, Builtin callout (*TOTAL_COUNT) is implemented by using this function for accumulate count of all of match processes in a search process. Builtin callout (*COUNT) returns count in last success match process only, because it doesn't use this function. + ONIG_NORMAL: normal return + ONIG_VALUE_IS_NOT_SET: value is not set / type is ONIG_TYPE_VOID + < 0: error code + (8) Callout data (used in applications) @@ -323,34 +326,34 @@ Callouts API Version 6.8.2 2018/06/08 Returns the callout data value/type for a callout slot indicated by callout_num/slot. 
- normal return: ONIG_NORMAL - 1: not yet set (type is ONIG_TYPE_VOID) - < 0: error code + ONIG_NORMAL: normal return + ONIG_VALUE_IS_NOT_SET: value is not set / type is ONIG_TYPE_VOID + < 0: error code # int onig_get_callout_data_by_tag(OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType* type, OnigValue* val) Returns the callout data value/type for a callout slot indicated by tag/slot. - normal return: ONIG_NORMAL - 1: not yet set (type is ONIG_TYPE_VOID) - < 0: error code + ONIG_NORMAL: normal return + ONIG_VALUE_IS_NOT_SET: value is not set / type is ONIG_TYPE_VOID + < 0: error code # int onig_set_callout_data(OnigRegex reg, OnigMatchParam* mp, int callout_num, int slot, OnigType type, OnigValue* val) Set the callout data value/type for a callout slot indicated by callout_num/slot. - normal return: ONIG_NORMAL - < 0: error code + ONIG_NORMAL: normal return + < 0: error code # int onig_set_callout_data_by_tag(OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType type, OnigValue* val) Set the callout data value/type for a callout slot indicated by tag/slot. - normal return: ONIG_NORMAL - < 0: error code + ONIG_NORMAL: normal return + < 0: error code # int onig_get_callout_data_dont_clear_old(OnigRegex reg, OnigMatchParam* mp, int callout_num, int slot, OnigType* type, OnigValue* val) @@ -359,6 +362,15 @@ Callouts API Version 6.8.2 2018/06/08 It will be abolished. +# int onig_get_callout_data_by_tag_dont_clear_old(regex_t* reg, OnigMatchParam* mp, const UChar* tag, const UChar* tag_end, int slot, OnigType* type, OnigValue* val) + + Returns the callout data value/type for a callout slot indicated by tag/slot. + This function does not clear the value set in the collation position before the current position. 
(dont_clear_old) + + ONIG_NORMAL: normal return + ONIG_VALUE_IS_NOT_SET: value is not set / type is ONIG_TYPE_VOID + < 0: error code + (9) Miscellaneous functions diff --git a/doc/CALLOUTS.API.ja b/doc/CALLOUTS.API.ja index c56555a..028e07a 100644 --- a/doc/CALLOUTS.API.ja +++ b/doc/CALLOUTS.API.ja @@ -1,4 +1,4 @@ -Callouts API Version 6.8.2 2018/06/08 +Callouts API Version 6.9.7 2021/03/21 #include <oniguruma.h> @@ -269,49 +269,50 @@ Callouts API Version 6.8.2 2018/06/08 callout_num/slotによって示された呼び出しスロットに対するデータの値/型を返す。 - 正常終了: ONIG_NORMAL - 1: 値が未セット (typeは ONIG_TYPE_VOID) - < 0: エラーコード + ONIG_NORMAL: 正常終了 + ONIG_VALUE_IS_NOT_SET: 値が未セット / 型がVOID + < 0: エラーコード # int onig_get_callout_data_by_callout_args_self(OnigCalloutArgs* args, int slot, OnigType* type, OnigValue* val) 自分自身の呼び出しのslotによって示されたスロットに対するデータの値/型を返す。 - 正常終了: ONIG_NORMAL - 1: 値が未セット (typeは ONIG_TYPE_VOID) - < 0: エラーコード + ONIG_NORMAL: 正常終了 + ONIG_VALUE_IS_NOT_SET: 値が未セット / 型がVOID + < 0: エラーコード # int onig_set_callout_data_by_callout_args(OnigCalloutArgs* args, int callout_num, int slot, OnigType type, OnigValue* val) callout_num/slotによって示された呼び出しスロットに対する値/型をセットする。。 - 正常終了: ONIG_NORMAL - < 0: エラーコード + ONIG_NORMAL: 正常終了 + < 0: エラーコード # int onig_set_callout_data_by_callout_args_self(OnigCalloutArgs* args, int slot, OnigType type, OnigValue* val) 自分自身の呼び出しのslotによって示されたスロットに対する値/型をセットする。。 - 正常終了: ONIG_NORMAL - < 0: エラーコード + ONIG_NORMAL: 正常終了 + < 0: エラーコード # int onig_get_callout_data_by_callout_args_self_dont_clear_old(OnigCalloutArgs* args, int slot, OnigType* type, OnigValue* val) この関数は、onig_get_callout_data_by_callout_args_self()とほぼ同じである。 - しかしこの関数は、現在の照合処理以前の失敗した照合処理の中でセットされた値を - クリアしない。 - 他のonig_get_callout_data_xxxx()関数は、以前の失敗した照合処理の中でセットされた値を - クリアする。 + しかしこの関数は、現在の位置より以前の照合位置の中でセットされた値をクリアしない。(dont_clear_old) + 他のonig_get_callout_data_xxxx()関数は、以前の位置の照合処理の中でセットされた値をクリアする。 例えば、組み込み呼び出し(*TOTAL_COUNT)は、検索処理の中の全ての照合処理の積算カウントを 得るためにこの関数を使用して実装されている。 組み込む呼び出し(*COUNT)は、この関数を使用しないので、最後の成功した照合処理だけの カウントを返す。 + ONIG_NORMAL: 正常終了 + ONIG_VALUE_IS_NOT_SET:
値が未セット / 型がVOID + < 0: エラーコード (8) 呼び出しデータ (アプリケーションから使用される) @@ -320,34 +321,34 @@ Callouts API Version 6.8.2 2018/06/08 callout_num/slotによって示された呼び出しスロットに対するデータの値/型を返す。 - 正常終了: ONIG_NORMAL - 1: 値が未セット (typeは ONIG_TYPE_VOID) - < 0: エラーコード + ONIG_NORMAL: 正常終了 + ONIG_VALUE_IS_NOT_SET: 値が未セット / 型がVOID + < 0: エラーコード # int onig_get_callout_data_by_tag(OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType* type, OnigValue* val) tag/slotによって示された呼び出しスロットに対するデータの値/型を返す。 - 正常終了: ONIG_NORMAL - 1: 値が未セット (typeは ONIG_TYPE_VOID) - < 0: エラーコード + ONIG_NORMAL: 正常終了 + ONIG_VALUE_IS_NOT_SET: 値が未セット / 型がVOID + < 0: エラーコード # int onig_set_callout_data(OnigRegex reg, OnigMatchParam* mp, int callout_num, int slot, OnigType type, OnigValue* val) callout_num/slotによって示された呼び出しスロットに対する値/型をセットする。。 - 正常終了: ONIG_NORMAL - < 0: エラーコード + ONIG_NORMAL: 正常終了 + < 0: エラーコード # int onig_set_callout_data_by_tag(OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType type, OnigValue* val) - tag/slotによって示された呼び出しスロットに対する値/型をセットする。。 + tag/slotによって示された呼び出しスロットに対する値/型をセットする。 - 正常終了: ONIG_NORMAL - < 0: エラーコード + ONIG_NORMAL: 正常終了 + < 0: エラーコード # int onig_get_callout_data_dont_clear_old(OnigRegex reg, OnigMatchParam* mp, int callout_num, int slot, OnigType* type, OnigValue* val) @@ -356,6 +357,16 @@ Callouts API Version 6.8.2 2018/06/08 廃止予定。 +# int onig_get_callout_data_by_tag_dont_clear_old(regex_t* reg, OnigMatchParam* mp, const UChar* tag, const UChar* tag_end, int slot, OnigType* type, OnigValue* val) + + tag/slotによって示された呼び出しスロットに対するデータの値/型を返す。 + この関数は、現在の位置より以前の照合位置の中でセットされた値をクリアしない。 + (dont_clear_old) + + ONIG_NORMAL: 正常終了 + ONIG_VALUE_IS_NOT_SET: 値が未セット / 型がVOID + < 0: エラーコード + (9) その他の関数 diff --git a/harnesses/base.c b/harnesses/base.c index 1206217..70f98f7 100644 --- a/harnesses/base.c +++ b/harnesses/base.c @@ -1,6 +1,6 @@ /* * base.c contributed by Mark Griffin - * Copyright (c) 2019-2020 K.Kosako + * Copyright 
(c) 2019-2021 K.Kosako */ #include <stdio.h> #include <unistd.h> @@ -12,23 +12,31 @@ #include <time.h> #include "oniguruma.h" -#define PARSE_DEPTH_LIMIT 8 -#define CALL_MAX_NEST_LEVEL 8 -#define SUBEXP_CALL_LIMIT 500 -#define BASE_RETRY_LIMIT 20000 -#define BASE_LENGTH 2048 -#define MATCH_STACK_LIMIT 10000000 -#define MAX_REM_SIZE 1048576 -#define MAX_SLOW_REM_SIZE 1024 -#define SLOW_RETRY_LIMIT 2000 - -//#define EXEC_PRINT_INTERVAL 500000 -//#define DUMP_DATA_INTERVAL 100000 -//#define STAT_PATH "fuzzer.stat_log" - -#define OPTIONS_AT_COMPILE (ONIG_OPTION_IGNORECASE | ONIG_OPTION_EXTEND | ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE | ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY | ONIG_OPTION_NEGATE_SINGLELINE | ONIG_OPTION_DONT_CAPTURE_GROUP | ONIG_OPTION_CAPTURE_GROUP | ONIG_OPTION_WORD_IS_ASCII | ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_SPACE_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII | ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER | ONIG_OPTION_TEXT_SEGMENT_WORD ) +#define PARSE_DEPTH_LIMIT 8 +#define MAX_SUBEXP_CALL_NEST_LEVEL 8 +#define SUBEXP_CALL_LIMIT 1000 +#define BASE_RETRY_LIMIT 20000 +#define BASE_LENGTH 2048 +#define MATCH_STACK_LIMIT 10000000 +#define MAX_REM_SIZE 1048576 +#define MAX_SLOW_REM_SIZE 1024 +#define MAX_SLOW_REM_SIZE2 100 +#define SLOW_RETRY_LIMIT 2000 +#define SLOW_SUBEXP_CALL_LIMIT 100 +#define MAX_SLOW_BACKWARD_REM_SIZE 200 + +//#define EXEC_PRINT_INTERVAL 500000 +//#define DUMP_DATA_INTERVAL 100000 +//#define STAT_PATH "fuzzer.stat_log" +//#define PREV_CONTROL + +#ifdef PREV_CONTROL +#define OPTIONS_AT_COMPILE (ONIG_OPTION_IGNORECASE | ONIG_OPTION_EXTEND | ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE | ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY | ONIG_OPTION_NEGATE_SINGLELINE | ONIG_OPTION_DONT_CAPTURE_GROUP | ONIG_OPTION_CAPTURE_GROUP | ONIG_OPTION_WORD_IS_ASCII | ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_SPACE_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII | ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER | 
ONIG_OPTION_TEXT_SEGMENT_WORD) +#else +#define OPTIONS_AT_COMPILE (ONIG_OPTION_IGNORECASE | ONIG_OPTION_EXTEND | ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE | ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY | ONIG_OPTION_NEGATE_SINGLELINE | ONIG_OPTION_DONT_CAPTURE_GROUP | ONIG_OPTION_CAPTURE_GROUP | ONIG_OPTION_WORD_IS_ASCII | ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_SPACE_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII | ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER | ONIG_OPTION_TEXT_SEGMENT_WORD | ONIG_OPTION_IGNORECASE_IS_ASCII) +#endif -#define OPTIONS_AT_RUNTIME (ONIG_OPTION_NOTBOL | ONIG_OPTION_NOTEOL | ONIG_OPTION_CHECK_VALIDITY_OF_STRING | ONIG_OPTION_NOT_BEGIN_STRING | ONIG_OPTION_NOT_END_STRING | ONIG_OPTION_NOT_BEGIN_POSITION) +#define OPTIONS_AT_RUNTIME (ONIG_OPTION_NOTBOL | ONIG_OPTION_NOTEOL | ONIG_OPTION_CHECK_VALIDITY_OF_STRING | ONIG_OPTION_NOT_BEGIN_STRING | ONIG_OPTION_NOT_END_STRING | ONIG_OPTION_NOT_BEGIN_POSITION | ONIG_OPTION_CALLBACK_EACH_MATCH) #define ADJUST_LEN(enc, len) do {\ @@ -38,6 +46,64 @@ typedef unsigned char uint8_t; + +//#define TEST_PATTERN + +#ifdef TEST_PATTERN + +#if 1 +unsigned char TestPattern[] = { +}; +#endif + +#endif /* TEST_PATTERN */ + +#ifdef STANDALONE + +static void +print_options(FILE* fp, OnigOptionType o) +{ + if ((o & ONIG_OPTION_IGNORECASE) != 0) fprintf(fp, " IGNORECASE"); + if ((o & ONIG_OPTION_EXTEND) != 0) fprintf(fp, " EXTEND"); + if ((o & ONIG_OPTION_MULTILINE) != 0) fprintf(fp, " MULTILINE"); + if ((o & ONIG_OPTION_SINGLELINE) != 0) fprintf(fp, " SINGLELINE"); + if ((o & ONIG_OPTION_FIND_LONGEST) != 0) fprintf(fp, " FIND_LONGEST"); + if ((o & ONIG_OPTION_FIND_NOT_EMPTY) != 0) fprintf(fp, " FIND_NOT_EMPTY"); + if ((o & ONIG_OPTION_NEGATE_SINGLELINE) != 0) fprintf(fp, " NEGATE_SINGLELINE"); + if ((o & ONIG_OPTION_DONT_CAPTURE_GROUP) != 0) fprintf(fp, " DONT_CAPTURE_GROUP"); + if ((o & ONIG_OPTION_CAPTURE_GROUP) != 0) fprintf(fp, " CAPTURE_GROUP"); + if ((o & ONIG_OPTION_NOTBOL) != 0) fprintf(fp, " 
NOTBOL"); + if ((o & ONIG_OPTION_NOTEOL) != 0) fprintf(fp, " NOTEOL"); + if ((o & ONIG_OPTION_POSIX_REGION) != 0) fprintf(fp, " POSIX_REGION"); + if ((o & ONIG_OPTION_CHECK_VALIDITY_OF_STRING) != 0) fprintf(fp, " CHECK_VALIDITY_OF_STRING"); + if ((o & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) fprintf(fp, " IGNORECASE_IS_ASCII"); + if ((o & ONIG_OPTION_WORD_IS_ASCII) != 0) fprintf(fp, " WORD_IS_ASCII"); + if ((o & ONIG_OPTION_DIGIT_IS_ASCII) != 0) fprintf(fp, " DIGIT_IS_ASCII"); + if ((o & ONIG_OPTION_SPACE_IS_ASCII) != 0) fprintf(fp, " SPACE_IS_ASCII"); + if ((o & ONIG_OPTION_POSIX_IS_ASCII) != 0) fprintf(fp, " POSIX_IS_ASCII"); + if ((o & ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER) != 0) fprintf(fp, " TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER"); + if ((o & ONIG_OPTION_TEXT_SEGMENT_WORD) != 0) fprintf(fp, " TEXT_SEGMENT_WORD"); + if ((o & ONIG_OPTION_NOT_BEGIN_STRING) != 0) fprintf(fp, " NOT_BIGIN_STRING"); + if ((o & ONIG_OPTION_NOT_END_STRING) != 0) fprintf(fp, " NOT_END_STRING"); + if ((o & ONIG_OPTION_NOT_BEGIN_POSITION) != 0) fprintf(fp, " NOT_BEGIN_POSITION"); + if ((o & ONIG_OPTION_CALLBACK_EACH_MATCH) != 0) fprintf(fp, " CALLBACK_EACH_MATCH"); +} + +static void +to_binary(unsigned int v, char s[/* 33 */]) +{ + unsigned int mask; + int i; + + mask = 1 << (sizeof(v) * 8 - 1); + i = 0; + do { + s[i++] = (mask & v ? 
'1' : '0'); + } while (mask >>= 1); + s[i] = 0; +} +#endif + #ifdef DUMP_INPUT static void dump_input(unsigned char* data, size_t len) @@ -104,6 +170,7 @@ dump_data(FILE* fp, unsigned char* data, int len) #else +#ifdef EXEC_PRINT_INTERVAL static void output_current_time(FILE* fp) { @@ -115,10 +182,24 @@ output_current_time(FILE* fp) fprintf(fp, "%s", d); } +#endif #endif static int +progress_callout_func(OnigCalloutArgs* args, void* user_data) +{ + return ONIG_CALLOUT_SUCCESS; +} + +static int +each_match_callback_func(const UChar* str, const UChar* end, + const UChar* match_start, OnigRegion* region, void* user_data) +{ + return ONIG_NORMAL; +} + +static int search(regex_t* reg, unsigned char* str, unsigned char* end, OnigOptionType options, int backward, int sl) { int r; @@ -145,7 +226,10 @@ search(regex_t* reg, unsigned char* str, unsigned char* end, OnigOptionType opti onig_set_retry_limit_in_search(retry_limit); onig_set_match_stack_limit_size(MATCH_STACK_LIMIT); - onig_set_subexp_call_limit_in_search(SUBEXP_CALL_LIMIT); + if (sl >= 2) + onig_set_subexp_call_limit_in_search(SLOW_SUBEXP_CALL_LIMIT); + else + onig_set_subexp_call_limit_in_search(SUBEXP_CALL_LIMIT); if (backward != 0) { start = end; @@ -218,10 +302,12 @@ exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax, EXEC_COUNT_INTERVAL++; onig_initialize(&enc, 1); + (void)onig_set_progress_callout(progress_callout_func); #ifdef PARSE_DEPTH_LIMIT onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT); #endif - onig_set_subexp_call_max_nest_level(CALL_MAX_NEST_LEVEL); + onig_set_subexp_call_max_nest_level(MAX_SUBEXP_CALL_NEST_LEVEL); + onig_set_callback_each_match(each_match_callback_func); r = onig_new(®, pattern, pattern_end, (options & OPTIONS_AT_COMPILE), enc, syntax, &einfo); @@ -270,18 +356,38 @@ alloc_exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax, unsigned char *pattern_end; unsigned char *str_null_end; +#ifdef TEST_PATTERN + pattern = (unsigned char 
*)malloc(sizeof(TestPattern)); + memcpy(pattern, TestPattern, sizeof(TestPattern)); + pattern_end = pattern + sizeof(TestPattern); +#else pattern = (unsigned char *)malloc(pattern_size != 0 ? pattern_size : 1); memcpy(pattern, data, pattern_size); pattern_end = pattern + pattern_size; +#endif + data += pattern_size; rem_size -= pattern_size; if (rem_size > MAX_REM_SIZE) rem_size = MAX_REM_SIZE; sl = onig_detect_can_be_slow_pattern(pattern, pattern_end, options, enc, syntax); +#ifdef STANDALONE + fprintf(stdout, "sl: %d\n", sl); +#endif if (sl > 0) { - if (rem_size > MAX_SLOW_REM_SIZE) - rem_size = MAX_SLOW_REM_SIZE; + if (sl >= 100) { + if (rem_size > MAX_SLOW_REM_SIZE2) + rem_size = MAX_SLOW_REM_SIZE2; + } + else { + if (rem_size > MAX_SLOW_REM_SIZE) + rem_size = MAX_SLOW_REM_SIZE; + } + } + if (backward != 0 && enc == ONIG_ENCODING_GB18030) { + if (rem_size > MAX_SLOW_BACKWARD_REM_SIZE) + rem_size = MAX_SLOW_BACKWARD_REM_SIZE; } ADJUST_LEN(enc, rem_size); @@ -302,11 +408,19 @@ alloc_exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax, return r; } +#ifdef PREV_CONTROL #ifdef SYNTAX_TEST #define NUM_CONTROL_BYTES 7 #else #define NUM_CONTROL_BYTES 6 #endif +#else +#ifdef SYNTAX_TEST +#define NUM_CONTROL_BYTES 8 +#else +#define NUM_CONTROL_BYTES 7 +#endif +#endif int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) { @@ -365,6 +479,7 @@ int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) ONIG_SYNTAX_GNU_REGEX, ONIG_SYNTAX_JAVA, ONIG_SYNTAX_PERL_NG, + ONIG_SYNTAX_PYTHON, ONIG_SYNTAX_ONIGURUMA }; @@ -376,6 +491,7 @@ int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) "GNU Regex", "Java", "Perl+NG", + "Python", "Oniguruma" }; #endif @@ -394,8 +510,10 @@ int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) OnigSyntaxType* syntax; #ifndef STANDALONE +#ifdef EXEC_PRINT_INTERVAL static FILE* STAT_FP; #endif +#endif INPUT_COUNT++; @@ -438,14 +556,22 @@ int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) syntax 
= ONIG_SYNTAX_DEFAULT; #endif +#ifdef PREV_CONTROL if ((data[2] & 0xc0) == 0) options = data[0] | (data[1] << 8) | (data[2] << 16); +#else + if ((data[3] & 0xc0) == 0) + options = data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24); +#endif else options = data[0] & ONIG_OPTION_IGNORECASE; data++; rem_size--; data++; rem_size--; data++; rem_size--; +#ifndef PREV_CONTROL + data++; rem_size--; +#endif pattern_size_choice = data[0]; data++; rem_size--; @@ -465,18 +591,25 @@ int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) } #ifdef STANDALONE - dump_data(stdout, data, pattern_size); + { + char soptions[33]; + + dump_data(stdout, data, pattern_size); + to_binary(options, soptions); #ifdef SYNTAX_TEST - fprintf(stdout, - "enc: %s, syntax: %s, options: %u, pattern_size: %d, back:%d\n", - ONIGENC_NAME(enc), - syntax_names[syntax_choice % num_syntaxes], - options, - pattern_size, backward); + fprintf(stdout, + "enc: %s, syntax: %s, pattern_size: %d, back:%d\noptions: %s\n", + ONIGENC_NAME(enc), + syntax_names[syntax_choice % num_syntaxes], + pattern_size, backward, soptions); #else - fprintf(stdout, "enc: %s, options: %u, pattern_size: %d, back:%d\n", - ONIGENC_NAME(enc), options, pattern_size, backward); + fprintf(stdout, "enc: %s, pattern_size: %d, back:%d\noptions: %s\n", + ONIGENC_NAME(enc), pattern_size, backward, soptions); #endif + + print_options(stdout, options); + fprintf(stdout, "\n"); + } #endif #ifdef DUMP_INPUT @@ -1,7 +1,7 @@ #!/bin/sh # install - install a program, script, or datafile -scriptversion=2018-03-11.20; # UTC +scriptversion=2020-11-14.01; # UTC # This originates from X11R5 (mit/util/scripts/install.sh), which was # later released in X11R6 (xc/config/util/install.sh) with the @@ -69,6 +69,11 @@ posix_mkdir= # Desired mode of installed file. mode=0755 +# Create dirs (including intermediate dirs) using mode 755. +# This is like GNU 'install' as of coreutils 8.32 (2020). 
+mkdir_umask=22 + +backupsuffix= chgrpcmd= chmodcmd=$chmodprog chowncmd= @@ -99,18 +104,28 @@ Options: --version display version info and exit. -c (ignored) - -C install only if different (preserve the last data modification time) + -C install only if different (preserve data modification time) -d create directories instead of installing files. -g GROUP $chgrpprog installed files to GROUP. -m MODE $chmodprog installed files to MODE. -o USER $chownprog installed files to USER. + -p pass -p to $cpprog. -s $stripprog installed files. + -S SUFFIX attempt to back up existing files, with suffix SUFFIX. -t DIRECTORY install into DIRECTORY. -T report an error if DSTFILE is a directory. Environment variables override the default commands: CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG RMPROG STRIPPROG + +By default, rm is invoked with -f; when overridden with RMPROG, +it's up to you to specify -f if you want it. + +If -S is not specified, no backups are attempted. + +Email bug reports to bug-automake@gnu.org. +Automake home page: https://www.gnu.org/software/automake/ " while test $# -ne 0; do @@ -137,8 +152,13 @@ while test $# -ne 0; do -o) chowncmd="$chownprog $2" shift;; + -p) cpprog="$cpprog -p";; + -s) stripcmd=$stripprog;; + -S) backupsuffix="$2" + shift;; + -t) is_target_a_directory=always dst_arg=$2 @@ -255,6 +275,10 @@ do dstdir=$dst test -d "$dstdir" dstdir_status=$? + # Don't chown directories that already exist. + if test $dstdir_status = 0; then + chowncmd="" + fi else # Waiting for this to be detected by the "$cpprog $src $dsttmp" command @@ -301,22 +325,6 @@ do if test $dstdir_status != 0; then case $posix_mkdir in '') - # Create intermediate dirs using mode 755 as modified by the umask. - # This is like FreeBSD 'install' as of 1997-10-28. - umask=`umask` - case $stripcmd.$umask in - # Optimize common cases. 
- *[2367][2367]) mkdir_umask=$umask;; - .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; - - *[0-7]) - mkdir_umask=`expr $umask + 22 \ - - $umask % 100 % 40 + $umask % 20 \ - - $umask % 10 % 4 + $umask % 2 - `;; - *) mkdir_umask=$umask,go-w;; - esac - # With -d, create the new directory with the user-specified mode. # Otherwise, rely on $mkdir_umask. if test -n "$dir_arg"; then @@ -326,52 +334,49 @@ do fi posix_mkdir=false - case $umask in - *[123567][0-7][0-7]) - # POSIX mkdir -p sets u+wx bits regardless of umask, which - # is incompatible with FreeBSD 'install' when (umask & 300) != 0. - ;; - *) - # Note that $RANDOM variable is not portable (e.g. dash); Use it - # here however when possible just to lower collision chance. - tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ - - trap 'ret=$?; rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" 2>/dev/null; exit $ret' 0 - - # Because "mkdir -p" follows existing symlinks and we likely work - # directly in world-writeable /tmp, make sure that the '$tmpdir' - # directory is successfully created first before we actually test - # 'mkdir -p' feature. - if (umask $mkdir_umask && - $mkdirprog $mkdir_mode "$tmpdir" && - exec $mkdirprog $mkdir_mode -p -- "$tmpdir/a/b") >/dev/null 2>&1 - then - if test -z "$dir_arg" || { - # Check for POSIX incompatibilities with -m. - # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or - # other-writable bit of parent directory when it shouldn't. - # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. - test_tmpdir="$tmpdir/a" - ls_ld_tmpdir=`ls -ld "$test_tmpdir"` - case $ls_ld_tmpdir in - d????-?r-*) different_mode=700;; - d????-?--*) different_mode=755;; - *) false;; - esac && - $mkdirprog -m$different_mode -p -- "$test_tmpdir" && { - ls_ld_tmpdir_1=`ls -ld "$test_tmpdir"` - test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" - } - } - then posix_mkdir=: - fi - rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" - else - # Remove any dirs left behind by ancient mkdir implementations. 
- rmdir ./$mkdir_mode ./-p ./-- "$tmpdir" 2>/dev/null - fi - trap '' 0;; - esac;; + # The $RANDOM variable is not portable (e.g., dash). Use it + # here however when possible just to lower collision chance. + tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ + + trap ' + ret=$? + rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" 2>/dev/null + exit $ret + ' 0 + + # Because "mkdir -p" follows existing symlinks and we likely work + # directly in world-writeable /tmp, make sure that the '$tmpdir' + # directory is successfully created first before we actually test + # 'mkdir -p'. + if (umask $mkdir_umask && + $mkdirprog $mkdir_mode "$tmpdir" && + exec $mkdirprog $mkdir_mode -p -- "$tmpdir/a/b") >/dev/null 2>&1 + then + if test -z "$dir_arg" || { + # Check for POSIX incompatibilities with -m. + # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or + # other-writable bit of parent directory when it shouldn't. + # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. + test_tmpdir="$tmpdir/a" + ls_ld_tmpdir=`ls -ld "$test_tmpdir"` + case $ls_ld_tmpdir in + d????-?r-*) different_mode=700;; + d????-?--*) different_mode=755;; + *) false;; + esac && + $mkdirprog -m$different_mode -p -- "$test_tmpdir" && { + ls_ld_tmpdir_1=`ls -ld "$test_tmpdir"` + test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" + } + } + then posix_mkdir=: + fi + rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" + else + # Remove any dirs left behind by ancient mkdir implementations. + rmdir ./$mkdir_mode ./-p ./-- "$tmpdir" 2>/dev/null + fi + trap '' 0;; esac if @@ -382,7 +387,7 @@ do then : else - # The umask is ridiculous, or mkdir does not conform to POSIX, + # mkdir does not conform to POSIX, # or it failed possibly due to a race condition. Create the # directory the slow way, step by step, checking for races as we go. 
@@ -411,7 +416,7 @@ do prefixes= else if $posix_mkdir; then - (umask=$mkdir_umask && + (umask $mkdir_umask && $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break # Don't fail if two instances are running concurrently. test -d "$prefix" || exit 1 @@ -451,7 +456,18 @@ do trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 # Copy the file name to the temp name. - (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && + (umask $cp_umask && + { test -z "$stripcmd" || { + # Create $dsttmp read-write so that cp doesn't create it read-only, + # which would cause strip to fail. + if test -z "$doit"; then + : >"$dsttmp" # No need to fork-exec 'touch'. + else + $doit touch "$dsttmp" + fi + } + } && + $doit_exec $cpprog "$src" "$dsttmp") && # and set any options; do chmod last to preserve setuid bits. # @@ -477,6 +493,13 @@ do then rm -f "$dsttmp" else + # If $backupsuffix is set, and the file being installed + # already exists, attempt a backup. Don't worry if it fails, + # e.g., if mv doesn't support -f. + if test -n "$backupsuffix" && test -f "$dst"; then + $doit $mvcmd -f "$dst" "$dst$backupsuffix" 2>/dev/null + fi + # Rename the file to the real destination. $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || @@ -491,9 +514,9 @@ do # file should still install successfully. { test ! -f "$dst" || - $doit $rmcmd -f "$dst" 2>/dev/null || + $doit $rmcmd "$dst" 2>/dev/null || { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && - { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } + { $doit $rmcmd "$rmtmp" 2>/dev/null; :; } } || { echo "$0: cannot unlink or rename $dst" >&2 (exit 1); exit 1 @@ -3,7 +3,7 @@ scriptversion=2018-03-07.03; # UTC -# Copyright (C) 1996-2018 Free Software Foundation, Inc. +# Copyright (C) 1996-2020 Free Software Foundation, Inc. # Originally written by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996. 
# This program is free software; you can redistribute it and/or modify diff --git a/sample/Makefile.am b/sample/Makefile.am index c2c4596..681cd2a 100644 --- a/sample/Makefile.am +++ b/sample/Makefile.am @@ -4,13 +4,13 @@ lib_onig = ../src/libonig.la LDADD = $(lib_onig) AM_CFLAGS = -Wall -AM_LDFLAGS = -L$(prefix)/lib +AM_LDFLAGS = -L$(libdir) AM_CPPFLAGS = -I$(top_srcdir)/src if ENABLE_POSIX_API -TESTS = encode listcap names posix simple sql syntax user_property callout echo count bug_fix regset scan +TESTS = encode listcap names posix simple sql syntax user_property callout echo count bug_fix regset scan callback_each_match else -TESTS = encode listcap names simple sql syntax user_property callout echo count bug_fix regset scan +TESTS = encode listcap names simple sql syntax user_property callout echo count bug_fix regset scan callback_each_match endif check_PROGRAMS = $(TESTS) @@ -29,6 +29,7 @@ count_SOURCES = count.c bug_fix = bug_fix.c regset_SOURCES = regset.c scan_SOURCES = scan.c +callback_each_match_SOURCES = callback_each_match.c sampledir = . 
@@ -49,3 +50,4 @@ endif $(sampledir)/bug_fix $(sampledir)/regset $(sampledir)/scan + $(sampledir)/callback_each_match diff --git a/sample/callback_each_match.c b/sample/callback_each_match.c new file mode 100644 index 0000000..10ed56d --- /dev/null +++ b/sample/callback_each_match.c @@ -0,0 +1,168 @@ +/* + * callback_each_match.c + */ +#include <stdio.h> +#include <string.h> +#include "oniguruma.h" + +static int +each_match_callback(const UChar* str, const UChar* end, + const UChar* match_start, OnigRegion* region, void* user_data) +{ +#if 1 + fprintf(stdout, "each_match_callback:\n"); + fprintf(stdout, " match at: %ld - %d: %p\n", match_start - str, region->end[0], + user_data); + fprintf(stdout, " region[0]: %d - %d\n", region->beg[0], region->end[0]); +#else + int i; + i = region->beg[0]; + fputc('<', stdout); + while (i < region->end[0]) { + fputc((int )str[i], stdout); + i++; + } + fputc('>', stdout); +#endif + +#if 0 + /* terminate match/search if returns error code < 0 */ + return ONIG_ABORT; +#endif + + return ONIG_NORMAL; +} + +static int +search(UChar* pattern, UChar* str, OnigOptionType options, OnigOptionType runtime_options) +{ + int r; + unsigned char *start, *range, *end; + regex_t* reg; + OnigErrorInfo einfo; + OnigRegion *region; + OnigMatchParam* mp; + void* user_data; + + r = onig_new(&reg, pattern, pattern + strlen((char* )pattern), + options, ONIG_ENCODING_ASCII, ONIG_SYNTAX_DEFAULT, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stderr, "ERROR: %s\n", s); + return -1; + } + + region = onig_region_new(); + + end = str + strlen((char* )str); + start = str; + range = end; + mp = onig_new_match_param(); + if (mp == 0) return -2; + + user_data = (void* )0x1234; + onig_set_callout_user_data_of_match_param(mp, user_data); + + r = onig_search_with_param(reg, str, end, start, range, region, + runtime_options, mp); + onig_free_match_param(mp); + if (r >= 0) { + /* If
ONIG_OPTION_CALLBACK_EACH_MATCH is used with + ONIG_OPTION_FIND_LONGEST, it may also return positive value. */ + fprintf(stdout, "\nr: %d\n", r); + } + else if (r == ONIG_MISMATCH) { + /* always return ONIG_MISMATCH if ONIG_OPTION_CALLBACK_EACH_MATCH */ + fprintf(stdout, "\n"); + } + else { /* error */ + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); + fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + onig_end(); + return -1; + } + + return 0; +} + +static int +match(UChar* pattern, UChar* str, UChar* at, OnigOptionType options, OnigOptionType runtime_options) +{ + int r; + unsigned char *start, *range, *end; + regex_t* reg; + OnigErrorInfo einfo; + OnigRegion *region; + OnigMatchParam* mp; + void* user_data; + + r = onig_new(&reg, pattern, pattern + strlen((char* )pattern), + options, ONIG_ENCODING_ASCII, ONIG_SYNTAX_DEFAULT, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stderr, "ERROR: %s\n", s); + return -1; + } + + region = onig_region_new(); + + end = str + strlen((char* )str); + start = str; + range = end; + mp = onig_new_match_param(); + if (mp == 0) return -2; + + user_data = (void* )0x1234; + onig_set_callout_user_data_of_match_param(mp, user_data); + + r = onig_match_with_param(reg, str, end, at, region, runtime_options, mp); + onig_free_match_param(mp); + if (r >= 0) { + /* If ONIG_OPTION_CALLBACK_EACH_MATCH is used with + ONIG_OPTION_FIND_LONGEST, it may also return positive value.
*/ + fprintf(stdout, "\nr: %d\n", r); + } + else if (r == ONIG_MISMATCH) { + /* always return ONIG_MISMATCH if ONIG_OPTION_CALLBACK_EACH_MATCH */ + fprintf(stdout, "\n"); + } + else { /* error */ + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); + fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + onig_end(); + return -1; + } + + return 0; +} + +extern int main(int argc, char* argv[]) +{ + OnigEncoding use_encs[1]; + + static UChar* pattern = (UChar* )"a(.*)\\Kb|[e-f]+"; + static UChar* str = (UChar* )"zzzzafffb"; + + use_encs[0] = ONIG_ENCODING_ASCII; + onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); + onig_set_callback_each_match(each_match_callback); + + fprintf(stdout, "<search>\n"); + search(pattern, str, ONIG_OPTION_NONE, ONIG_OPTION_CALLBACK_EACH_MATCH); + fprintf(stdout, "<search with FIND_LONGEST>\n"); + search(pattern, str, ONIG_OPTION_FIND_LONGEST, ONIG_OPTION_CALLBACK_EACH_MATCH); + + fprintf(stdout, "<match>\n"); + match(pattern, str, str + 5, ONIG_OPTION_NONE, ONIG_OPTION_CALLBACK_EACH_MATCH); + + onig_end(); + return 0; +} diff --git a/sample/count.c b/sample/count.c index 2b67db7..904101c 100644 --- a/sample/count.c +++ b/sample/count.c @@ -59,10 +59,10 @@ test(OnigEncoding enc, OnigMatchParam* mp, char* in_pattern, char* in_str) tag_len = ulen(enc, tag); slot = 0; - r = onig_get_callout_data_by_tag(reg, mp, (UChar* )tag, (UChar* )tag + tag_len, - slot, 0, &val); + r = onig_get_callout_data_by_tag_dont_clear_old(reg, mp, (UChar* )tag, + (UChar* )tag + tag_len, slot, 0, &val); if (r < ONIG_NORMAL) goto err; - else if (r > ONIG_NORMAL) { + else if (r == ONIG_VALUE_IS_NOT_SET) { fprintf(stdout, "COUNT[x]: NO DATA\n"); } else { diff --git a/sample/listcap.c b/sample/listcap.c index c0d3014..8072842 100644 --- a/sample/listcap.c +++ b/sample/listcap.c @@ -103,7 +103,7 @@ extern int main(int argc, char* argv[]) use_encs[0] = 
ONIG_ENCODING_ASCII; onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); - /* enable capture hostory */ + /* enable capture history */ onig_copy_syntax(&syn, ONIG_SYNTAX_DEFAULT); onig_set_syntax_op2(&syn, onig_get_syntax_op2(&syn) | ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY); diff --git a/src/Makefile.windows b/src/Makefile.windows index 11d6fd8..b637772 100644 --- a/src/Makefile.windows +++ b/src/Makefile.windows @@ -2,8 +2,9 @@ product_name = oniguruma -TEST_DIR = $(ONIG_DIR)/../test -WIN_DIR = $(ONIG_DIR)/../windows +TEST_DIR = $(ONIG_DIR)/../test +SAMPLE_DIR = $(ONIG_DIR)/../sample +WIN_DIR = $(ONIG_DIR)/../windows CPPFLAGS = CFLAGS = -O2 -nologo /W3 @@ -15,6 +16,8 @@ ARDLL = cl ARDLL_FLAGS = -nologo -LD $(LINKFLAGS) -dll LINKFLAGS = -link -incremental:no -pdb:none +SAMPLE_CFLAGS = $(CFLAGS) /I$(ONIG_DIR) + INSTALL = install -c CP = copy CC = cl @@ -89,11 +92,6 @@ makeargs = $(MFLAGS) CPPFLAGS='$(CPPFLAGS)' CFLAGS='$(CFLAGS)' CC='$(CC)' # targets default: all -setup: - $(CP) ..\win32\config.h config.h - $(CP) ..\win32\testc.c testc.c - - all: $(libname) $(dllname) $(libname): $(libobjs) $(encobjs) @@ -155,7 +153,7 @@ $(BUILD_DIR)/unicode_fold1_key.obj: $(ONIG_DIR)/unicode_fold1_key.c $(ONIG_DIR)/ $(BUILD_DIR)/unicode_fold2_key.obj: $(ONIG_DIR)/unicode_fold2_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h $(BUILD_DIR)/unicode_fold3_key.obj: $(ONIG_DIR)/unicode_fold3_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h -all-test: test_syntax test_regset test_utf8 testc testp testu +all-test: test_syntax test_regset test_utf8 test_options test_back testc testp testu test_syntax: $(TEST_DIR)/test_syntax.c $(libname) $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_syntax.c $(libname) @@ -166,6 +164,12 @@ test_regset: $(TEST_DIR)/test_regset.c $(libname) test_utf8: $(TEST_DIR)/test_utf8.c $(libname) $(CC) -nologo /Fe:$@ /I. 
/I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_utf8.c $(libname) +test_options: $(TEST_DIR)/test_options.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_options.c $(libname) + +test_back: $(TEST_DIR)/test_back.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_back.c $(libname) + testc: $(WIN_DIR)/testc.c $(libname) $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(WIN_DIR)/testc.c $(libname) @@ -176,14 +180,17 @@ testu: $(TEST_DIR)/testu.c $(libname) $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(TEST_DIR)/testu.c $(libname) clean: - del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_syntax.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe + del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_syntax.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\test_options.exe $(BUILD_DIR)\test_back.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe samples: all - $(CC) $(CFLAGS) -I. /Fe:simple $(ONIG_DIR)\sample\simple.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:posix $(ONIG_DIR)\sample\posix.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:names $(ONIG_DIR)\sample\names.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:listcap $(ONIG_DIR)\sample\listcap.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:sql $(ONIG_DIR)\sample\sql.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:encode $(ONIG_DIR)\sample\encode.c $(dlllib) - $(CC) $(CFLAGS) -I. 
/Fe:syntax $(ONIG_DIR)\sample\syntax.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:simple $(SAMPLE_DIR)\simple.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:posix $(SAMPLE_DIR)\posix.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:names $(SAMPLE_DIR)\names.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:listcap $(SAMPLE_DIR)\listcap.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:sql $(SAMPLE_DIR)\sql.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:encode $(SAMPLE_DIR)\encode.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:syntax $(SAMPLE_DIR)\syntax.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:count $(SAMPLE_DIR)\count.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:regset $(SAMPLE_DIR)\regset.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:callback_each_match $(SAMPLE_DIR)\callback_each_match.c $(dlllib) diff --git a/src/cp1251.c b/src/cp1251.c index fa20780..36b36f6 100644 --- a/src/cp1251.c +++ b/src/cp1251.c @@ -2,7 +2,7 @@ cp1251.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2006-2019 Byte <byte AT mail DOT kna DOT ru> + * Copyright (c) 2006-2020 Byte <byte AT mail DOT kna DOT ru> * K.Kosako * All rights reserved. 
* @@ -105,12 +105,16 @@ static const unsigned short EncCP1251_CtypeTable[256] = { }; static int -cp1251_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +cp1251_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_CP1251_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_CP1251_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/gb18030.c b/src/gb18030.c index 7409d3e..1da19b4 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -30,9 +30,11 @@ #include "regenc.h" -#if 1 +/* #define DEBUG_GB18030 */ -#define DEBUG_GB18030(arg) +#ifndef DEBUG_GB18030 + +#define DEBUG_OUT(arg) #else @@ -43,7 +45,7 @@ /* for printf() */ #include "regint.h" -#define DEBUG_GB18030(arg) printf arg +#define DEBUG_OUT(arg) printf arg #endif @@ -177,8 +179,8 @@ gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype) } enum state { - S_START, - S_one_C2, + S_START = 0, + S_one_C2 = 1, S_one_C4, S_one_CM, @@ -210,15 +212,43 @@ enum state { S_odd_CM_even_C4CM, }; +#ifdef DEBUG_GB18030 +static char* StateNames[] = { + "S_START", + "S_one_C2", + "S_one_C4", + "S_one_CM", + "S_odd_CM_one_CX", + "S_even_CM_one_CX", + "S_one_CMC4", + "S_odd_CMC4", + "S_one_C4_odd_CMC4", + "S_even_CMC4", + "S_one_C4_even_CMC4", + "S_odd_CM_odd_CMC4", + "S_even_CM_odd_CMC4", + "S_odd_CM_even_CMC4", + "S_even_CM_even_CMC4", + "S_odd_C4CM", + "S_one_CM_odd_C4CM", + "S_even_C4CM", + "S_one_CM_even_C4CM", + "S_even_CM_odd_C4CM", + "S_odd_CM_odd_C4CM", + "S_even_CM_even_C4CM", + "S_odd_CM_even_C4CM" +}; +#endif + static UChar* gb18030_left_adjust_char_head(const UChar* start, const UChar* s) { const UChar *p; enum state state = S_START; - DEBUG_GB18030(("----------------\n")); + DEBUG_OUT(("----------------\n")); for (p = s; p >= start; p--) { - DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p)); + DEBUG_OUT(("%5d: state %-19s (0x%02x)->\n", (int )(p - 
start), StateNames[state], *p)); switch (state) { case S_START: switch (GB18030_MAP[*p]) { @@ -499,7 +529,7 @@ gb18030_left_adjust_char_head(const UChar* start, const UChar* s) } } - DEBUG_GB18030(("state %d\n", state)); + DEBUG_OUT(("state %-19s\n", StateNames[state])); switch (state) { case S_START: return (UChar *)(s - 0); case S_one_C2: return (UChar *)(s - 0); diff --git a/src/iso8859_1.c b/src/iso8859_1.c index d75509e..2013e75 100644 --- a/src/iso8859_1.c +++ b/src/iso8859_1.c @@ -2,7 +2,7 @@ iso8859_1.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -114,7 +114,7 @@ apply_all_case_fold(OnigCaseFoldType flag, } static int -get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, +get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) { @@ -123,7 +123,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, if (0x41 <= *p && *p <= 0x5a) { if (*p == LARGE_S && end > p + 1 - && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */ + && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* SS */ ss_combination: items[0].byte_len = 2; items[0].code_len = 1; @@ -152,7 +153,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, } else if (0x61 <= *p && *p <= 0x7a) { if (*p == SMALL_S && end > p + 1 - && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { /* ss */ + && (*(p+1) == SMALL_S || *(p+1) == LARGE_S) + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* ss */ goto ss_combination; } @@ -161,56 +163,58 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, items[0].code[0] = (OnigCodePoint )(*p - 0x20); return 1; } - else if (0xc0 <= *p && *p <= 0xcf) { - items[0].byte_len = 1; - items[0].code_len = 1; - 
items[0].code[0] = (OnigCodePoint )(*p + 0x20); - return 1; - } - else if (0xd0 <= *p && *p <= 0xdf) { - if (*p == 0xdf) { + else if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { + if (0xc0 <= *p && *p <= 0xcf) { items[0].byte_len = 1; - items[0].code_len = 2; - items[0].code[0] = (OnigCodePoint )'s'; - items[0].code[1] = (OnigCodePoint )'s'; + items[0].code_len = 1; + items[0].code[0] = (OnigCodePoint )(*p + 0x20); + return 1; + } + else if (0xd0 <= *p && *p <= 0xdf) { + if (*p == 0xdf) { + items[0].byte_len = 1; + items[0].code_len = 2; + items[0].code[0] = (OnigCodePoint )'s'; + items[0].code[1] = (OnigCodePoint )'s'; - items[1].byte_len = 1; - items[1].code_len = 2; - items[1].code[0] = (OnigCodePoint )'S'; - items[1].code[1] = (OnigCodePoint )'S'; + items[1].byte_len = 1; + items[1].code_len = 2; + items[1].code[0] = (OnigCodePoint )'S'; + items[1].code[1] = (OnigCodePoint )'S'; - items[2].byte_len = 1; - items[2].code_len = 2; - items[2].code[0] = (OnigCodePoint )'s'; - items[2].code[1] = (OnigCodePoint )'S'; + items[2].byte_len = 1; + items[2].code_len = 2; + items[2].code[0] = (OnigCodePoint )'s'; + items[2].code[1] = (OnigCodePoint )'S'; - items[3].byte_len = 1; - items[3].code_len = 2; - items[3].code[0] = (OnigCodePoint )'S'; - items[3].code[1] = (OnigCodePoint )'s'; + items[3].byte_len = 1; + items[3].code_len = 2; + items[3].code[0] = (OnigCodePoint )'S'; + items[3].code[1] = (OnigCodePoint )'s'; - return 4; - } - else if (*p != 0xd7) { - items[0].byte_len = 1; - items[0].code_len = 1; - items[0].code[0] = (OnigCodePoint )(*p + 0x20); - return 1; + return 4; + } + else if (*p != 0xd7) { + items[0].byte_len = 1; + items[0].code_len = 1; + items[0].code[0] = (OnigCodePoint )(*p + 0x20); + return 1; + } } - } - else if (0xe0 <= *p && *p <= 0xef) { - items[0].byte_len = 1; - items[0].code_len = 1; - items[0].code[0] = (OnigCodePoint )(*p - 0x20); - return 1; - } - else if (0xf0 <= *p && *p <= 0xfe) { - if (*p != 0xf7) { + else if (0xe0 <= *p && *p <= 0xef) { 
items[0].byte_len = 1; items[0].code_len = 1; items[0].code[0] = (OnigCodePoint )(*p - 0x20); return 1; } + else if (0xf0 <= *p && *p <= 0xfe) { + if (*p != 0xf7) { + items[0].byte_len = 1; + items[0].code_len = 1; + items[0].code[0] = (OnigCodePoint )(*p - 0x20); + return 1; + } + } } return 0; @@ -229,7 +233,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 2; } - *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_10.c b/src/iso8859_10.c index e98cffb..e4bf599 100644 --- a/src/iso8859_10.c +++ b/src/iso8859_10.c @@ -2,7 +2,7 @@ iso8859_10.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_13.c b/src/iso8859_13.c index 2bd460f..dbf747f 100644 --- a/src/iso8859_13.c +++ b/src/iso8859_13.c @@ -2,7 +2,7 @@ iso8859_13.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_14.c b/src/iso8859_14.c index 5030b55..a6d6b71 100644 --- a/src/iso8859_14.c +++ b/src/iso8859_14.c @@ -2,7 +2,7 @@ iso8859_14.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_15.c b/src/iso8859_15.c index f32c3de..0bb6b12 100644 --- a/src/iso8859_15.c +++ b/src/iso8859_15.c @@ -2,7 +2,7 @@ iso8859_15.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_16.c b/src/iso8859_16.c index 22a653a..bfd0a5b 100644 --- a/src/iso8859_16.c +++ b/src/iso8859_16.c @@ -2,7 +2,7 @@ iso8859_16.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_2.c b/src/iso8859_2.c index dc3d0a1..d08140e 100644 --- a/src/iso8859_2.c +++ b/src/iso8859_2.c @@ -2,7 +2,7 @@ iso8859_2.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_3.c b/src/iso8859_3.c index 49dc6b2..69b96fd 100644 --- a/src/iso8859_3.c +++ b/src/iso8859_3.c @@ -2,7 +2,7 @@ iso8859_3.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 2; } - *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_4.c b/src/iso8859_4.c index f3f6ba9..949b7a1 100644 --- a/src/iso8859_4.c +++ b/src/iso8859_4.c @@ -2,7 +2,7 @@ iso8859_4.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_5.c b/src/iso8859_5.c index a5f587c..9e5d418 100644 --- a/src/iso8859_5.c +++ b/src/iso8859_5.c @@ -2,7 +2,7 @@ iso8859_5.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_5_CtypeTable[256] = { }; static int -mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_7.c b/src/iso8859_7.c index 018efac..07b1360 100644 --- a/src/iso8859_7.c +++ b/src/iso8859_7.c @@ -2,7 +2,7 @@ iso8859_7.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_7_CtypeTable[256] = { }; static int -mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_9.c b/src/iso8859_9.c index 1f9bdea..6f205e5 100644 --- a/src/iso8859_9.c +++ b/src/iso8859_9.c @@ -2,7 +2,7 @@ iso8859_9.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } @@ -2,7 +2,7 @@ koi8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -105,12 +105,16 @@ static const unsigned short EncKOI8_CtypeTable[256] = { static int -koi8_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +koi8_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_KOI8_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_KOI8_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/koi8_r.c b/src/koi8_r.c index c77302f..31cc870 100644 --- a/src/koi8_r.c +++ b/src/koi8_r.c @@ -2,7 +2,7 @@ koi8_r.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -109,7 +109,11 @@ koi8_r_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, { const UChar* p = *pp; - *lower = ENC_KOI8_R_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_KOI8_R_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/oniguruma.h b/src/oniguruma.h index d983fc9..a7b9d8f 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -4,7 +4,7 @@ oniguruma.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -36,9 +36,9 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 6 #define ONIGURUMA_VERSION_MINOR 9 -#define ONIGURUMA_VERSION_TEENY 6 +#define ONIGURUMA_VERSION_TEENY 7 -#define ONIGURUMA_VERSION_INT 60906 +#define ONIGURUMA_VERSION_INT 60907 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -91,6 +91,7 @@ typedef unsigned int OnigCaseFoldType; /* case fold flag */ ONIG_EXTERN OnigCaseFoldType OnigDefaultCaseFoldFlag; +#define ONIGENC_CASE_FOLD_ASCII_ONLY (1) /* #define ONIGENC_CASE_FOLD_HIRAGANA_KATAKANA (1<<1) */ /* #define ONIGENC_CASE_FOLD_KATAKANA_WIDTH (1<<2) */ #define ONIGENC_CASE_FOLD_TURKISH_AZERI (1<<20) @@ -387,9 +388,9 @@ typedef unsigned int OnigOptionType; #define ONIG_OPTION_NOTEOL (ONIG_OPTION_NOTBOL << 1) #define ONIG_OPTION_POSIX_REGION (ONIG_OPTION_NOTEOL << 1) #define ONIG_OPTION_CHECK_VALIDITY_OF_STRING (ONIG_OPTION_POSIX_REGION << 1) -/* #define ONIG_OPTION_CRLF_AS_LINE_SEPARATOR (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 1) */ /* options (compile time) */ -#define ONIG_OPTION_WORD_IS_ASCII (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 4) +#define ONIG_OPTION_IGNORECASE_IS_ASCII (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 3) +#define ONIG_OPTION_WORD_IS_ASCII (ONIG_OPTION_IGNORECASE_IS_ASCII << 1) #define ONIG_OPTION_DIGIT_IS_ASCII (ONIG_OPTION_WORD_IS_ASCII << 1) #define ONIG_OPTION_SPACE_IS_ASCII (ONIG_OPTION_DIGIT_IS_ASCII << 1) #define ONIG_OPTION_POSIX_IS_ASCII (ONIG_OPTION_SPACE_IS_ASCII << 1) @@ -399,8 +400,9 @@ typedef unsigned int OnigOptionType; #define ONIG_OPTION_NOT_BEGIN_STRING (ONIG_OPTION_TEXT_SEGMENT_WORD << 1) #define ONIG_OPTION_NOT_END_STRING (ONIG_OPTION_NOT_BEGIN_STRING << 1) #define ONIG_OPTION_NOT_BEGIN_POSITION (ONIG_OPTION_NOT_END_STRING << 1) +#define ONIG_OPTION_CALLBACK_EACH_MATCH (ONIG_OPTION_NOT_BEGIN_POSITION << 1) -#define ONIG_OPTION_MAXBIT ONIG_OPTION_NOT_BEGIN_POSITION +#define ONIG_OPTION_MAXBIT 
ONIG_OPTION_CALLBACK_EACH_MATCH #define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt)) #define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt)) @@ -425,6 +427,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxJava; ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl; ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl_NG; ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; +ONIG_EXTERN OnigSyntaxType OnigSyntaxPython; ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma; /* predefined syntaxes (see regsyntax.c) */ @@ -438,6 +441,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma; #define ONIG_SYNTAX_PERL (&OnigSyntaxPerl) #define ONIG_SYNTAX_PERL_NG (&OnigSyntaxPerl_NG) #define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby) +#define ONIG_SYNTAX_PYTHON (&OnigSyntaxPython) #define ONIG_SYNTAX_ONIGURUMA (&OnigSyntaxOniguruma) /* default syntax */ @@ -510,6 +514,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS (1U<<28) /* (?{...}) (?{{...}}) */ #define ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME (1U<<29) /* (*name) (*name{a,..}) */ #define ONIG_SYN_OP2_OPTION_ONIGURUMA (1U<<30) /* (?imxWDSPy) */ +#define ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME (1U<<31) /* (?P<name>...) (?P=name) */ /* syntax (behavior) */ #define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */ @@ -525,6 +530,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */ #define ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH (1U<<10) /* ..(?i)...|... */ #define ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND (1U<<11) /* (?<=a+|..) */ +#define ONIG_SYN_PYTHON (1U<<12) /* \UHHHHHHHH */ /* syntax (behavior) in char class [...] */ #define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] 
*/ @@ -548,8 +554,10 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; /* error codes */ #define ONIG_IS_PATTERN_ERROR(ecode) ((ecode) <= -100 && (ecode) > -1000) + /* normal return */ #define ONIG_NORMAL 0 +#define ONIG_VALUE_IS_NOT_SET 1 #define ONIG_MISMATCH -1 #define ONIG_NO_SUPPORT_CONFIG -2 #define ONIG_ABORT -3 @@ -607,6 +615,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209 #define ONIGERR_TOO_MANY_CAPTURES -210 #define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212 +#define ONIGERR_UNDEFINED_OPERATOR -213 #define ONIGERR_EMPTY_GROUP_NAME -214 #define ONIGERR_INVALID_GROUP_NAME -215 #define ONIGERR_INVALID_CHAR_IN_GROUP_NAME -216 @@ -633,6 +642,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_INVALID_COMBINATION_OF_OPTIONS -403 #define ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS -404 #define ONIGERR_TOO_LONG_PROPERTY_NAME -405 +#define ONIGERR_VERY_INEFFICIENT_PATTERN -406 #define ONIGERR_LIBRARY_IS_NOT_INITIALIZED -500 /* errors related to thread */ @@ -717,6 +727,8 @@ typedef struct { OnigCaseFoldType case_fold_flag; } OnigCompileInfo; +typedef int (*OnigCallbackEachMatchFunc)(const OnigUChar* str, const OnigUChar* end, const OnigUChar* match_start, OnigRegion* region, void* user_data); + /* types for callout */ typedef enum { @@ -940,6 +952,12 @@ const char* onig_version P_((void)); ONIG_EXTERN const char* onig_copyright P_((void)); +/* for callback each match */ +ONIG_EXTERN +OnigCallbackEachMatchFunc onig_get_callback_each_match P_((void)); +ONIG_EXTERN +int onig_set_callback_each_match P_((OnigCallbackEachMatchFunc f)); + /* for OnigMatchParam */ ONIG_EXTERN OnigMatchParam* onig_new_match_param P_((void)); @@ -981,6 +999,8 @@ ONIG_EXTERN int onig_get_callout_data_by_tag P_((OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType* type, OnigValue* val)); ONIG_EXTERN int onig_set_callout_data_by_tag P_((OnigRegex reg, OnigMatchParam* mp, const 
OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType type, OnigValue* val)); +ONIG_EXTERN +int onig_get_callout_data_by_tag_dont_clear_old P_((regex_t* reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType* type, OnigValue* val)); /* used in callout functions */ ONIG_EXTERN diff --git a/src/regcomp.c b/src/regcomp.c index dd2b328..d80551d 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -2,7 +2,7 @@ regcomp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,6 +31,9 @@ #define OPS_INIT_SIZE 8 +#define NODE_IS_REAL_IGNORECASE(node) \ + (NODE_IS_IGNORECASE(node) && !NODE_STRING_IS_CRUDE(node)) + typedef struct { OnigLen min; OnigLen max; @@ -44,7 +47,7 @@ typedef struct { OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN; -static OnigLen node_min_byte_len(Node* node, ScanEnv* env); +static OnigLen node_min_byte_len(Node* node, ParseEnv* env); #if 0 typedef struct { @@ -129,27 +132,22 @@ ops_init(regex_t* reg, int init_alloc_size) Operation* p; size_t size; - if (init_alloc_size > 0) { - size = sizeof(Operation) * init_alloc_size; - p = (Operation* )xrealloc(reg->ops, size); - CHECK_NULL_RETURN_MEMERR(p); - reg->ops = p; + if (init_alloc_size <= 0) + return ONIGERR_PARSER_BUG; + + size = sizeof(Operation) * init_alloc_size; + p = (Operation* )xrealloc(reg->ops, size); + CHECK_NULL_RETURN_MEMERR(p); + reg->ops = p; #ifdef USE_DIRECT_THREADED_CODE - { - enum OpCode* cp; - size = sizeof(enum OpCode) * init_alloc_size; - cp = (enum OpCode* )xrealloc(reg->ocs, size); - CHECK_NULL_RETURN_MEMERR(cp); - reg->ocs = cp; - } -#endif + { + enum OpCode* cp; + size = sizeof(enum OpCode) * init_alloc_size; + cp = (enum OpCode* )xrealloc(reg->ocs, size); + CHECK_NULL_RETURN_MEMERR(cp); 
+ reg->ocs = cp; } - else { - reg->ops = (Operation* )0; -#ifdef USE_DIRECT_THREADED_CODE - reg->ocs = (enum OpCode* )0; #endif - } reg->ops_curr = 0; /* !!! not yet done ops_new() */ reg->ops_alloc = init_alloc_size; @@ -159,19 +157,16 @@ ops_init(regex_t* reg, int init_alloc_size) } static int -ops_expand(regex_t* reg, int n) +ops_resize(regex_t* reg, int n) { -#define MIN_OPS_EXPAND_SIZE 4 - #ifdef USE_DIRECT_THREADED_CODE enum OpCode* cp; #endif Operation* p; size_t size; - if (n <= 0) n = MIN_OPS_EXPAND_SIZE; - - n += reg->ops_alloc; + if (n == reg->ops_alloc) return ONIG_NORMAL; + if (n <= 0) return ONIGERR_PARSER_BUG; size = sizeof(Operation) * n; p = (Operation* )xrealloc(reg->ops, size); @@ -197,10 +192,8 @@ ops_expand(regex_t* reg, int n) static int ops_new(regex_t* reg) { - int r; - if (reg->ops_used >= reg->ops_alloc) { - r = ops_expand(reg, reg->ops_alloc); + int r = ops_resize(reg, reg->ops_alloc << 1); if (r != ONIG_NORMAL) return r; } @@ -669,6 +662,8 @@ mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt) if (to->max < alt->max) to->max = alt->max; } +#ifndef ONIG_DONT_OPTIMIZE + static int mml_is_equal(MinMaxLen* a, MinMaxLen* b) { @@ -709,9 +704,11 @@ mml_alt_merge(MinMaxLen* to, MinMaxLen* alt) if (to->max < alt->max) to->max = alt->max; } +#endif + /* fixed size pattern node only */ static int -node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, +node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env, int level) { MinMaxCharLen tci; @@ -768,7 +765,8 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, StrNode* sn = STR_(node); UChar *s = sn->s; - if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { + if (NODE_IS_REAL_IGNORECASE(node) && + CASE_FOLD_IS_NOT_ASCII_ONLY(env->case_fold_flag)) { /* Such a case is possible. ex. /(?i)(?<=\1)(a)/ Backref node refer to capture group, but it doesn't tune yet. 
@@ -917,7 +915,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, { int i; int* backs; - MemEnv* mem_env = SCANENV_MEMENV(env); + MemEnv* mem_env = PARSEENV_MEMENV(env); BackRefNode* br = BACKREF_(node); backs = BACKREFS_P(br); @@ -943,7 +941,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, } static int -node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env) +node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env) { return node_char_len1(node, reg, ci, env, 0); } @@ -967,7 +965,7 @@ add_op(regex_t* reg, int opcode) } static int compile_length_tree(Node* node, regex_t* reg); -static int compile_tree(Node* node, regex_t* reg, ScanEnv* env); +static int compile_tree(Node* node, regex_t* reg, ParseEnv* env); #define IS_NEED_STR_LEN_OP(op) \ @@ -1035,7 +1033,7 @@ is_strict_real_node(Node* node) } static int -compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) +compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ParseEnv* env) { int r; int saved_num_empty_check; @@ -1060,14 +1058,20 @@ compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) if (emptiness == BODY_MAY_BE_EMPTY) r = add_op(reg, OP_EMPTY_CHECK_END); else if (emptiness == BODY_MAY_BE_EMPTY_MEM) { - if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0) + if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0 && qn->empty_status_mem != 0) { r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); + if (r != 0) return r; + COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem; + } else r = add_op(reg, OP_EMPTY_CHECK_END); } #ifdef USE_CALL - else if (emptiness == BODY_MAY_BE_EMPTY_REC) + else if (emptiness == BODY_MAY_BE_EMPTY_REC) { r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); + if (r != 0) return r; + COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem; + } #endif if (r != 0) return r; @@ -1078,7 +1082,7 @@ compile_quant_body_with_empty_check(QuantNode* qn, regex_t* 
reg, ScanEnv* env) #ifdef USE_CALL static int -compile_call(CallNode* node, regex_t* reg, ScanEnv* env) +compile_call(CallNode* node, regex_t* reg, ParseEnv* env) { int r; int offset; @@ -1098,7 +1102,7 @@ compile_call(CallNode* node, regex_t* reg, ScanEnv* env) #endif static int -compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env) +compile_tree_n_times(Node* node, int n, regex_t* reg, ParseEnv* env) { int i, r; @@ -1356,7 +1360,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index) static int compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, - regex_t* reg, ScanEnv* env) + regex_t* reg, ParseEnv* env) { int r; int num_repeat = reg->num_repeat++; @@ -1469,7 +1473,7 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) } static int -compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) +compile_quantifier_node(QuantNode* qn, regex_t* reg, ParseEnv* env) { int i, r, mod_tlen; int infinite = IS_INFINITE_REPEAT(qn->upper); @@ -1649,7 +1653,7 @@ compile_length_option_node(BagNode* node, regex_t* reg) } static int -compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env) +compile_option_node(BagNode* node, regex_t* reg, ParseEnv* env) { int r; @@ -1765,7 +1769,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) } static int -compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) +compile_bag_memory_node(BagNode* node, regex_t* reg, ParseEnv* env) { int r; @@ -1845,7 +1849,7 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) } static int -compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) +compile_bag_node(BagNode* node, regex_t* reg, ParseEnv* env) { int r, len; @@ -2036,7 +2040,7 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) } static int -compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env) +compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ParseEnv* env) { int r; @@ -2150,7 
+2154,7 @@ compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env) static int compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg, - ScanEnv* env) + ParseEnv* env) { int r; int len; @@ -2279,7 +2283,7 @@ compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg, } static int -compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) +compile_anchor_node(AnchorNode* node, regex_t* reg, ParseEnv* env) { int r, len; enum OpCode op; @@ -2573,7 +2577,7 @@ compile_length_tree(Node* node, regex_t* reg) } static int -compile_tree(Node* node, regex_t* reg, ScanEnv* env) +compile_tree(Node* node, regex_t* reg, ParseEnv* env) { int n, len, pos, r = 0; @@ -2983,7 +2987,7 @@ numbered_ref_check(Node* node) } static int -disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) +disable_noname_group_capture(Node** root, regex_t* reg, ParseEnv* env) { int r, i, pos, counter; MemStatusType loc; @@ -3003,7 +3007,7 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) for (i = 1, pos = 1; i <= env->num_mem; i++) { if (map[i].new_val > 0) { - SCANENV_MEMENV(env)[pos] = SCANENV_MEMENV(env)[i]; + PARSEENV_MEMENV(env)[pos] = PARSEENV_MEMENV(env)[i]; pos++; } } @@ -3285,8 +3289,7 @@ get_tree_head_literal(Node* node, int exact, regex_t* reg) if (sn->end <= sn->s) break; - if (exact == 0 || - ! NODE_IS_IGNORECASE(node) || NODE_STRING_IS_CRUDE(node)) { + if (exact == 0 || !NODE_IS_REAL_IGNORECASE(node)) { n = node; } } @@ -3381,7 +3384,7 @@ get_tree_tail_literal(Node* node, Node** rnode, regex_t* reg) break; } - if (NODE_IS_IGNORECASE(node) && ! 
NODE_STRING_IS_CRUDE(node)) { + if (NODE_IS_REAL_IGNORECASE(node)) { r = GET_VALUE_NONE; break; } @@ -3601,7 +3604,7 @@ check_node_in_look_behind(Node* node, int not, int* used) } static OnigLen -node_min_byte_len(Node* node, ScanEnv* env) +node_min_byte_len(Node* node, ParseEnv* env) { OnigLen len; OnigLen tmin; @@ -3612,7 +3615,7 @@ node_min_byte_len(Node* node, ScanEnv* env) if (! NODE_IS_CHECKER(node)) { int i; int* backs; - MemEnv* mem_env = SCANENV_MEMENV(env); + MemEnv* mem_env = PARSEENV_MEMENV(env); BackRefNode* br = BACKREF_(node); if (NODE_IS_RECURSION(node)) break; @@ -3629,10 +3632,8 @@ node_min_byte_len(Node* node, ScanEnv* env) case NODE_CALL: { Node* t = NODE_BODY(node); - if (NODE_IS_RECURSION(node)) { - if (NODE_IS_FIXED_MIN(t)) - len = BAG_(t)->min_len; - } + if (NODE_IS_FIXED_MIN(t)) + len = BAG_(t)->min_len; else len = node_min_byte_len(t, env); } @@ -3742,143 +3743,8 @@ node_min_byte_len(Node* node, ScanEnv* env) return len; } -static OnigLen -node_max_byte_len(Node* node, ScanEnv* env) -{ - OnigLen len; - OnigLen tmax; - - len = 0; - switch (NODE_TYPE(node)) { - case NODE_LIST: - do { - tmax = node_max_byte_len(NODE_CAR(node), env); - len = distance_add(len, tmax); - } while (IS_NOT_NULL(node = NODE_CDR(node))); - break; - - case NODE_ALT: - do { - tmax = node_max_byte_len(NODE_CAR(node), env); - if (len < tmax) len = tmax; - } while (IS_NOT_NULL(node = NODE_CDR(node))); - break; - - case NODE_STRING: - { - StrNode* sn = STR_(node); - len = (OnigLen )(sn->end - sn->s); - } - break; - - case NODE_CTYPE: - case NODE_CCLASS: - len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - break; - - case NODE_BACKREF: - if (! 
NODE_IS_CHECKER(node)) { - int i; - int* backs; - MemEnv* mem_env = SCANENV_MEMENV(env); - BackRefNode* br = BACKREF_(node); - if (NODE_IS_RECURSION(node)) { -#ifdef USE_BACKREF_WITH_LEVEL - if (NODE_IS_NEST_LEVEL(node)) { - len = INFINITE_LEN; - } -#endif - break; - } - backs = BACKREFS_P(br); - for (i = 0; i < br->back_num; i++) { - tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env); - if (len < tmax) len = tmax; - } - } - break; - -#ifdef USE_CALL - case NODE_CALL: - if (! NODE_IS_RECURSION(node)) - len = node_max_byte_len(NODE_BODY(node), env); - else - len = INFINITE_LEN; - break; -#endif - - case NODE_QUANT: - { - QuantNode* qn = QUANT_(node); - - if (qn->upper != 0) { - len = node_max_byte_len(NODE_BODY(node), env); - if (len != 0) { - if (! IS_INFINITE_REPEAT(qn->upper)) - len = distance_multiply(len, qn->upper); - else - len = INFINITE_LEN; - } - } - } - break; - - case NODE_BAG: - { - BagNode* en = BAG_(node); - switch (en->type) { - case BAG_MEMORY: - if (NODE_IS_FIXED_MAX(node)) - len = en->max_len; - else { - if (NODE_IS_MARK1(node)) - len = INFINITE_LEN; - else { - NODE_STATUS_ADD(node, MARK1); - len = node_max_byte_len(NODE_BODY(node), env); - NODE_STATUS_REMOVE(node, MARK1); - - en->max_len = len; - NODE_STATUS_ADD(node, FIXED_MAX); - } - } - break; - - case BAG_OPTION: - case BAG_STOP_BACKTRACK: - len = node_max_byte_len(NODE_BODY(node), env); - break; - case BAG_IF_ELSE: - { - OnigLen tlen, elen; - - len = node_max_byte_len(NODE_BODY(node), env); - if (IS_NOT_NULL(en->te.Then)) { - tlen = node_max_byte_len(en->te.Then, env); - len = distance_add(len, tlen); - } - if (IS_NOT_NULL(en->te.Else)) - elen = node_max_byte_len(en->te.Else, env); - else elen = 0; - - if (elen > len) len = elen; - } - break; - } - } - break; - - case NODE_ANCHOR: - case NODE_GIMMICK: - default: - break; - } - - return len; -} - static int -check_backrefs(Node* node, ScanEnv* env) +check_backrefs(Node* node, ParseEnv* env) { int r; @@ -3923,7 +3789,7 @@ 
check_backrefs(Node* node, ScanEnv* env) int i; BackRefNode* br = BACKREF_(node); int* backs = BACKREFS_P(br); - MemEnv* mem_env = SCANENV_MEMENV(env); + MemEnv* mem_env = PARSEENV_MEMENV(env); for (i = 0; i < br->back_num; i++) { if (backs[i] > env->num_mem) @@ -3944,7 +3810,7 @@ check_backrefs(Node* node, ScanEnv* env) } static int -set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) +set_empty_repeat_node_trav(Node* node, Node* empty, ParseEnv* env) { int r; @@ -3998,7 +3864,7 @@ set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) if (en->type == BAG_MEMORY) { if (NODE_IS_BACKREF(node)) { if (IS_NOT_NULL(empty)) - SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty; + PARSEENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty; } } else if (en->type == BAG_IF_ELSE) { @@ -4034,7 +3900,7 @@ is_ancestor_node(Node* node, Node* me) } static void -set_empty_status_check_trav(Node* node, ScanEnv* env) +set_empty_status_check_trav(Node* node, ParseEnv* env) { switch (NODE_TYPE(node)) { case NODE_LIST: @@ -4078,14 +3944,14 @@ set_empty_status_check_trav(Node* node, ScanEnv* env) { int i; int* backs; - MemEnv* mem_env = SCANENV_MEMENV(env); + MemEnv* mem_env = PARSEENV_MEMENV(env); BackRefNode* br = BACKREF_(node); backs = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { Node* ernode = mem_env[backs[i]].empty_repeat_node; if (IS_NOT_NULL(ernode)) { if (! 
is_ancestor_node(ernode, node)) { - MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]); + MEM_STATUS_LIMIT_ON(QUANT_(ernode)->empty_status_mem, backs[i]); NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK); NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK); } @@ -4150,7 +4016,7 @@ set_parent_node_trav(Node* node, Node* parent) #define RECURSION_INFINITE (1<<2) static int -infinite_recursive_call_check(Node* node, ScanEnv* env, int head) +infinite_recursive_call_check(Node* node, ParseEnv* env, int head) { int ret; int r = 0; @@ -4191,6 +4057,8 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) break; case NODE_QUANT: + if (QUANT_(node)->upper == 0) break; + r = infinite_recursive_call_check(NODE_BODY(node), env, head); if (r < 0) return r; if ((r & RECURSION_MUST) != 0) { @@ -4265,7 +4133,7 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) } static int -infinite_recursive_call_check_trav(Node* node, ScanEnv* env) +infinite_recursive_call_check_trav(Node* node, ParseEnv* env) { int r; @@ -4403,7 +4271,7 @@ recursive_call_check(Node* node) #define FOUND_CALLED_NODE 1 static int -recursive_call_check_trav(Node* node, ScanEnv* env, int state) +recursive_call_check_trav(Node* node, ParseEnv* env, int state) { int r = 0; @@ -4443,19 +4311,21 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) BagNode* en = BAG_(node); if (en->type == BAG_MEMORY) { - if (NODE_IS_CALLED(node) || (state & IN_RECURSION) != 0) { + if (NODE_IS_CALLED(node)) { + r = FOUND_CALLED_NODE; + goto check_recursion; + } + else if ((state & IN_RECURSION) != 0) { + check_recursion: if (! 
NODE_IS_RECURSION(node)) { NODE_STATUS_ADD(node, MARK1); - r = recursive_call_check(NODE_BODY(node)); - if (r != 0) { + ret = recursive_call_check(NODE_BODY(node)); + if (ret != 0) { NODE_STATUS_ADD(node, RECURSION); MEM_STATUS_ON(env->backtrack_mem, en->m.regnum); } NODE_STATUS_REMOVE(node, MARK1); } - - if (NODE_IS_CALLED(node)) - r = FOUND_CALLED_NODE; } } @@ -4616,8 +4486,9 @@ reduce_string_list(Node* node, OnigEncoding enc) #define IN_VAR_REPEAT (1<<3) #define IN_ZERO_REPEAT (1<<4) #define IN_MULTI_ENTRY (1<<5) -#define IN_LOOK_BEHIND (1<<6) - +#define IN_PREC_READ (1<<6) +#define IN_LOOK_BEHIND (1<<7) +#define IN_PEEK (1<<8) /* divide different length alternatives in look-behind. (?<=A|B) ==> (?<=A)|(?<=B) @@ -4706,7 +4577,7 @@ list_reduce_in_look_behind(Node* node) } static int -alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env) +alt_reduce_in_look_behind(Node* node, regex_t* reg, ParseEnv* env) { int r; @@ -4725,10 +4596,10 @@ alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env) return r; } -static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); +static int tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env); static int -tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_look_behind(Node* node, regex_t* reg, int state, ParseEnv* env) { int r; int state1; @@ -5183,7 +5054,7 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state) return r; } -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT static enum BodyEmptyType quantifiers_memory_node_info(Node* node) { @@ -5265,7 +5136,7 @@ quantifiers_memory_node_info(Node* node) return r; } -#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #ifdef USE_CALL @@ -5274,9 +5145,9 @@ quantifiers_memory_node_info(Node* node) __inline #endif static int -check_call_reference(CallNode* cn, ScanEnv* env, int state) 
+check_call_reference(CallNode* cn, ParseEnv* env, int state) { - MemEnv* mem_env = SCANENV_MEMENV(env); + MemEnv* mem_env = PARSEENV_MEMENV(env); if (cn->by_number != 0) { int gnum = cn->called_gnum; @@ -5393,7 +5264,7 @@ tune_call2_call(Node* node) } static int -tune_call(Node* node, ScanEnv* env, int state) +tune_call(Node* node, ParseEnv* env, int state) { int r; @@ -5539,6 +5410,8 @@ tune_called_state_call(Node* node, int state) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; + if ((state & IN_PEEK) != 0) + NODE_STATUS_ADD(node, INPEEK); tune_called_state_call(NODE_QUANT_BODY(qn), state); } @@ -5551,10 +5424,12 @@ tune_called_state_call(Node* node, int state) switch (an->type) { case ANCR_PREC_READ_NOT: case ANCR_LOOK_BEHIND_NOT: - state |= IN_NOT; - /* fall */ + state |= (IN_NOT | IN_PEEK); + tune_called_state_call(NODE_ANCHOR_BODY(an), state); + break; case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: + state |= IN_PEEK; tune_called_state_call(NODE_ANCHOR_BODY(an), state); break; default: @@ -5597,6 +5472,11 @@ tune_called_state_call(Node* node, int state) break; case NODE_CALL: + if ((state & IN_PEEK) != 0) + NODE_STATUS_ADD(node, INPEEK); + if ((state & IN_REAL_REPEAT) != 0) + NODE_STATUS_ADD(node, IN_REAL_REPEAT); + tune_called_state_call(NODE_BODY(node), state); break; @@ -5620,6 +5500,11 @@ tune_called_state(Node* node, int state) #ifdef USE_CALL case NODE_CALL: + if ((state & IN_PEEK) != 0) + NODE_STATUS_ADD(node, INPEEK); + if ((state & IN_REAL_REPEAT) != 0) + NODE_STATUS_ADD(node, IN_REAL_REPEAT); + tune_called_state_call(node, state); break; #endif @@ -5659,6 +5544,8 @@ tune_called_state(Node* node, int state) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; + if ((state & IN_PEEK) != 0) + NODE_STATUS_ADD(node, INPEEK); tune_called_state(NODE_QUANT_BODY(qn), state); } @@ -5671,10 +5558,12 @@ tune_called_state(Node* node, int state) switch (an->type) { case ANCR_PREC_READ_NOT: case 
ANCR_LOOK_BEHIND_NOT: - state |= IN_NOT; - /* fall */ + state |= (IN_NOT | IN_PEEK); + tune_called_state(NODE_ANCHOR_BODY(an), state); + break; case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: + state |= IN_PEEK; tune_called_state(NODE_ANCHOR_BODY(an), state); break; default: @@ -5700,17 +5589,18 @@ tune_called_state(Node* node, int state) __inline #endif static int -tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_anchor(Node* node, regex_t* reg, int state, ParseEnv* env) { int r; AnchorNode* an = ANCHOR_(node); switch (an->type) { case ANCR_PREC_READ: - r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ), env); break; case ANCR_PREC_READ_NOT: - r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ | IN_NOT), + env); break; case ANCR_LOOK_BEHIND: @@ -5730,7 +5620,7 @@ tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) __inline #endif static int -tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_quant(Node* node, regex_t* reg, int state, ParseEnv* env) { int r; QuantNode* qn = QUANT_(node); @@ -5746,7 +5636,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) { OnigLen d = node_min_byte_len(body, env); if (d == 0) { -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT qn->emptiness = quantifiers_memory_node_info(body); #else qn->emptiness = BODY_MAY_BE_EMPTY; @@ -5807,7 +5697,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) 6. expand repeated string. */ static int -tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env) { int r = 0; @@ -5832,7 +5722,7 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case NODE_STRING: - if (NODE_IS_IGNORECASE(node) && ! 
NODE_STRING_IS_CRUDE(node)) { + if (NODE_IS_REAL_IGNORECASE(node)) { r = unravel_case_fold_string(node, reg, state); } break; @@ -5918,6 +5808,9 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case NODE_QUANT: + if ((state & (IN_PREC_READ | IN_LOOK_BEHIND)) != 0) + NODE_STATUS_ADD(node, INPEEK); + r = tune_quant(node, reg, state, env); break; @@ -5938,6 +5831,7 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) return r; } +#ifndef ONIG_DONT_OPTIMIZE static int set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand, UChar* s, UChar* end, @@ -6007,6 +5901,7 @@ set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand, return 0; } +#endif #define OPT_EXACT_MAXLEN 24 @@ -6019,7 +5914,7 @@ typedef struct { MinMaxLen mm; OnigEncoding enc; OnigCaseFoldType case_fold_flag; - ScanEnv* scan_env; + ParseEnv* scan_env; } OptEnv; typedef struct { @@ -6052,6 +5947,8 @@ typedef struct { } OptNode; +#ifndef ONIG_DONT_OPTIMIZE + static int map_position_value(OnigEncoding enc, int i) { @@ -6540,6 +6437,140 @@ alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env) mml_alt_merge(&to->len, &add->len); } +static OnigLen +node_max_byte_len(Node* node, ParseEnv* env) +{ + OnigLen len; + OnigLen tmax; + + len = 0; + switch (NODE_TYPE(node)) { + case NODE_LIST: + do { + tmax = node_max_byte_len(NODE_CAR(node), env); + len = distance_add(len, tmax); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ALT: + do { + tmax = node_max_byte_len(NODE_CAR(node), env); + if (len < tmax) len = tmax; + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_STRING: + { + StrNode* sn = STR_(node); + len = (OnigLen )(sn->end - sn->s); + } + break; + + case NODE_CTYPE: + case NODE_CCLASS: + len = ONIGENC_MBC_MAXLEN_DIST(env->enc); + break; + + case NODE_BACKREF: + if (! 
NODE_IS_CHECKER(node)) { + int i; + int* backs; + MemEnv* mem_env = PARSEENV_MEMENV(env); + BackRefNode* br = BACKREF_(node); + if (NODE_IS_RECURSION(node)) { +#ifdef USE_BACKREF_WITH_LEVEL + if (NODE_IS_NEST_LEVEL(node)) { + len = INFINITE_LEN; + } +#endif + break; + } + backs = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env); + if (len < tmax) len = tmax; + } + } + break; + +#ifdef USE_CALL + case NODE_CALL: + if (! NODE_IS_RECURSION(node)) + len = node_max_byte_len(NODE_BODY(node), env); + else + len = INFINITE_LEN; + break; +#endif + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + + if (qn->upper != 0) { + len = node_max_byte_len(NODE_BODY(node), env); + if (len != 0) { + if (! IS_INFINITE_REPEAT(qn->upper)) + len = distance_multiply(len, qn->upper); + else + len = INFINITE_LEN; + } + } + } + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + switch (en->type) { + case BAG_MEMORY: + if (NODE_IS_FIXED_MAX(node)) + len = en->max_len; + else { + if (NODE_IS_MARK1(node)) + len = INFINITE_LEN; + else { + NODE_STATUS_ADD(node, MARK1); + len = node_max_byte_len(NODE_BODY(node), env); + NODE_STATUS_REMOVE(node, MARK1); + + en->max_len = len; + NODE_STATUS_ADD(node, FIXED_MAX); + } + } + break; + + case BAG_OPTION: + case BAG_STOP_BACKTRACK: + len = node_max_byte_len(NODE_BODY(node), env); + break; + case BAG_IF_ELSE: + { + OnigLen tlen, elen; + + len = node_max_byte_len(NODE_BODY(node), env); + if (IS_NOT_NULL(en->te.Then)) { + tlen = node_max_byte_len(en->te.Then, env); + len = distance_add(len, tlen); + } + if (IS_NOT_NULL(en->te.Else)) + elen = node_max_byte_len(en->te.Else, env); + else elen = 0; + + if (elen > len) len = elen; + } + break; + } + } + break; + + case NODE_ANCHOR: + case NODE_GIMMICK: + default: + break; + } + + return len; +} #define MAX_NODE_OPT_INFO_REF_COUNT 5 @@ -6822,22 +6853,22 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) { OptEnv nenv; - 
copy_opt_env(&nenv, env); - r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); - if (r == 0) { - mml_add(&nenv.mm, &xo.len); - concat_left_node_opt_info(enc, opt, &xo); - if (IS_NOT_NULL(en->te.Then)) { - r = optimize_nodes(en->te.Then, &xo, &nenv); - if (r == 0) { - concat_left_node_opt_info(enc, opt, &xo); + if (IS_NOT_NULL(en->te.Else)) { + copy_opt_env(&nenv, env); + r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); + if (r == 0) { + mml_add(&nenv.mm, &xo.len); + concat_left_node_opt_info(enc, opt, &xo); + if (IS_NOT_NULL(en->te.Then)) { + r = optimize_nodes(en->te.Then, &xo, &nenv); + if (r == 0) { + concat_left_node_opt_info(enc, opt, &xo); + } } - } - if (IS_NOT_NULL(en->te.Else)) { - r = optimize_nodes(en->te.Else, &xo, env); - if (r == 0) - alt_merge_node_opt_info(opt, &xo, env); + r = optimize_nodes(en->te.Else, &xo, env); + if (r == 0) + alt_merge_node_opt_info(opt, &xo, env); } } } @@ -6930,7 +6961,7 @@ static void print_optimize_info(FILE* f, regex_t* reg); #endif static int -set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) +set_optimize_info_from_tree(Node* node, regex_t* reg, ParseEnv* scan_env) { int r; OptNode opt; @@ -6985,6 +7016,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) #endif return r; } +#endif /* ONIG_DONT_OPTIMIZE */ static void clear_optimize_info(regex_t* reg) @@ -7031,14 +7063,43 @@ static void print_enc_string(FILE* fp, OnigEncoding enc, s++; } } +} - fprintf(fp, "/\n"); +static void +print_options(FILE* fp, OnigOptionType o) +{ + if ((o & ONIG_OPTION_IGNORECASE) != 0) fprintf(fp, " IGNORECASE"); + if ((o & ONIG_OPTION_EXTEND) != 0) fprintf(fp, " EXTEND"); + if ((o & ONIG_OPTION_MULTILINE) != 0) fprintf(fp, " MULTILINE"); + if ((o & ONIG_OPTION_SINGLELINE) != 0) fprintf(fp, " SINGLELINE"); + if ((o & ONIG_OPTION_FIND_LONGEST) != 0) fprintf(fp, " FIND_LONGEST"); + if ((o & ONIG_OPTION_FIND_NOT_EMPTY) != 0) fprintf(fp, " FIND_NOT_EMPTY"); + if ((o & ONIG_OPTION_NEGATE_SINGLELINE) 
!= 0) fprintf(fp, " NEGATE_SINGLELINE"); + if ((o & ONIG_OPTION_DONT_CAPTURE_GROUP) != 0) fprintf(fp, " DONT_CAPTURE_GROUP"); + if ((o & ONIG_OPTION_CAPTURE_GROUP) != 0) fprintf(fp, " CAPTURE_GROUP"); + if ((o & ONIG_OPTION_NOTBOL) != 0) fprintf(fp, " NOTBOL"); + if ((o & ONIG_OPTION_NOTEOL) != 0) fprintf(fp, " NOTEOL"); + if ((o & ONIG_OPTION_POSIX_REGION) != 0) fprintf(fp, " POSIX_REGION"); + if ((o & ONIG_OPTION_CHECK_VALIDITY_OF_STRING) != 0) fprintf(fp, " CHECK_VALIDITY_OF_STRING"); + if ((o & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) fprintf(fp, " IGNORECASE_IS_ASCII"); + if ((o & ONIG_OPTION_WORD_IS_ASCII) != 0) fprintf(fp, " WORD_IS_ASCII"); + if ((o & ONIG_OPTION_DIGIT_IS_ASCII) != 0) fprintf(fp, " DIGIT_IS_ASCII"); + if ((o & ONIG_OPTION_SPACE_IS_ASCII) != 0) fprintf(fp, " SPACE_IS_ASCII"); + if ((o & ONIG_OPTION_POSIX_IS_ASCII) != 0) fprintf(fp, " POSIX_IS_ASCII"); + if ((o & ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER) != 0) fprintf(fp, " TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER"); + if ((o & ONIG_OPTION_TEXT_SEGMENT_WORD) != 0) fprintf(fp, " TEXT_SEGMENT_WORD"); + if ((o & ONIG_OPTION_NOT_BEGIN_STRING) != 0) fprintf(fp, " NOT_BIGIN_STRING"); + if ((o & ONIG_OPTION_NOT_END_STRING) != 0) fprintf(fp, " NOT_END_STRING"); + if ((o & ONIG_OPTION_NOT_BEGIN_POSITION) != 0) fprintf(fp, " NOT_BEGIN_POSITION"); + if ((o & ONIG_OPTION_CALLBACK_EACH_MATCH) != 0) fprintf(fp, " CALLBACK_EACH_MATCH"); } #endif /* ONIG_DEBUG */ #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) +#ifndef ONIG_DONT_OPTIMIZE + static void print_distance_range(FILE* f, OnigLen a, OnigLen b) { @@ -7161,7 +7222,8 @@ print_optimize_info(FILE* f, regex_t* reg) } } } -#endif +#endif /* ONIG_DONT_OPTIMIZE */ +#endif /* defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) */ extern RegexExt* @@ -7259,93 +7321,150 @@ static void print_tree P_((FILE* f, Node* node)); extern int onig_init_for_match_at(regex_t* reg); -extern int -onig_compile(regex_t* reg, const UChar* pattern, const 
UChar* pattern_end, - OnigErrorInfo* einfo) -{ - int r; - Node* root; - ScanEnv scan_env; +static int parse_and_tune(regex_t* reg, const UChar* pattern, + const UChar* pattern_end, ParseEnv *scan_env, Node** rroot, + OnigErrorInfo* einfo #ifdef USE_CALL - UnsetAddrList uslist = {0}; + , UnsetAddrList* uslist #endif +) +{ + int r; + Node* root; - root = 0; + root = NULL_NODE; if (IS_NOT_NULL(einfo)) { einfo->enc = reg->enc; einfo->par = (UChar* )NULL; } -#ifdef ONIG_DEBUG - fprintf(DBGFP, "\nPATTERN: /"); - print_enc_string(DBGFP, reg->enc, pattern, pattern_end); -#endif - - if (reg->ops_alloc == 0) { - r = ops_init(reg, OPS_INIT_SIZE); - if (r != 0) goto end; - } - else - reg->ops_used = 0; - - r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); + r = onig_parse_tree(&root, pattern, pattern_end, reg, scan_env); if (r != 0) goto err; r = reduce_string_list(root, reg->enc); if (r != 0) goto err; /* mixed use named group and no-named group */ - if (scan_env.num_named > 0 && - IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && + if (scan_env->num_named > 0 && + IS_SYNTAX_BV(scan_env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && ! 
OPTON_CAPTURE_GROUP(reg->options)) { - if (scan_env.num_named != scan_env.num_mem) - r = disable_noname_group_capture(&root, reg, &scan_env); + if (scan_env->num_named != scan_env->num_mem) + r = disable_noname_group_capture(&root, reg, scan_env); else r = numbered_ref_check(root); if (r != 0) goto err; } - r = check_backrefs(root, &scan_env); + r = check_backrefs(root, scan_env); if (r != 0) goto err; #ifdef USE_CALL - if (scan_env.num_call > 0) { - r = unset_addr_list_init(&uslist, scan_env.num_call); + if (scan_env->num_call > 0) { + r = unset_addr_list_init(uslist, scan_env->num_call); if (r != 0) goto err; - scan_env.unset_addr_list = &uslist; - r = tune_call(root, &scan_env, 0); + scan_env->unset_addr_list = uslist; + r = tune_call(root, scan_env, 0); if (r != 0) goto err_unset; r = tune_call2(root); if (r != 0) goto err_unset; - r = recursive_call_check_trav(root, &scan_env, 0); + r = recursive_call_check_trav(root, scan_env, 0); if (r < 0) goto err_unset; - r = infinite_recursive_call_check_trav(root, &scan_env); + r = infinite_recursive_call_check_trav(root, scan_env); if (r != 0) goto err_unset; tune_called_state(root, 0); } - reg->num_call = scan_env.num_call; + reg->num_call = scan_env->num_call; #endif #ifdef ONIG_DEBUG_PARSE - fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); - fprintf(DBGFP, "TREE (parsed)\n"); - print_tree(DBGFP, root); - fprintf(DBGFP, "\n"); + fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env->max_parse_depth); #endif - r = tune_tree(root, reg, 0, &scan_env); - if (r != 0) goto err_unset; + r = tune_tree(root, reg, 0, scan_env); + if (r != 0) { +#ifdef ONIG_DEBUG_PARSE + fprintf(DBGFP, "TREE (error in tune)\n"); + print_tree(DBGFP, root); + fprintf(DBGFP, "\n"); +#endif + goto err_unset; + } - if (scan_env.backref_num != 0) { + if (scan_env->backref_num != 0) { set_parent_node_trav(root, NULL_NODE); - r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env); + r = set_empty_repeat_node_trav(root, NULL_NODE, 
scan_env); if (r != 0) goto err_unset; - set_empty_status_check_trav(root, &scan_env); + set_empty_status_check_trav(root, scan_env); } + *rroot = root; + return r; + + err_unset: +#ifdef USE_CALL + if (scan_env->num_call > 0) { + unset_addr_list_end(uslist); + } +#endif + err: + if (IS_NOT_NULL(scan_env->error)) { + if (IS_NOT_NULL(einfo)) { + einfo->par = scan_env->error; + einfo->par_end = scan_env->error_end; + } + } + + onig_node_free(root); + if (IS_NOT_NULL(scan_env->mem_env_dynamic)) + xfree(scan_env->mem_env_dynamic); + + *rroot = NULL_NODE; + return r; +} + +extern int +onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, + OnigErrorInfo* einfo) +{ + int r; + Node* root; + ParseEnv scan_env; +#ifdef USE_CALL + UnsetAddrList uslist = {0}; +#endif + +#ifdef ONIG_DEBUG + fprintf(DBGFP, "\nPATTERN: /"); + print_enc_string(DBGFP, reg->enc, pattern, pattern_end); + fprintf(DBGFP, "/\n"); + fprintf(DBGFP, "OPTIONS:"); + print_options(DBGFP, reg->options); + fprintf(DBGFP, "\n"); +#endif + + if (reg->ops_alloc == 0) { + r = ops_init(reg, OPS_INIT_SIZE); + if (r != 0) { + if (IS_NOT_NULL(einfo)) { + einfo->enc = reg->enc; + einfo->par = (UChar* )NULL; + } + return r; + } + } + else + reg->ops_used = 0; + + r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, einfo +#ifdef USE_CALL + , &uslist +#endif + ); + if (r != 0) return r; + #ifdef ONIG_DEBUG_PARSE fprintf(DBGFP, "TREE (after tune)\n"); print_tree(DBGFP, root); @@ -7377,7 +7496,14 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, clear_optimize_info(reg); #ifndef ONIG_DONT_OPTIMIZE r = set_optimize_info_from_tree(root, reg, &scan_env); - if (r != 0) goto err_unset; + if (r != 0) { +#ifdef USE_CALL + if (scan_env.num_call > 0) { + unset_addr_list_end(&uslist); + } +#endif + goto err; + } #endif if (IS_NOT_NULL(scan_env.mem_env_dynamic)) { @@ -7407,6 +7533,9 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #endif + r = 
ops_resize(reg, reg->ops_used); + if (r != ONIG_NORMAL) goto err; + set_addr_in_repeat_range(reg); if ((reg->push_mem_end != 0) @@ -7449,15 +7578,8 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, onig_init_for_match_at(reg); #endif - end: return r; - err_unset: -#ifdef USE_CALL - if (scan_env.num_call > 0) { - unset_addr_list_end(&uslist); - } -#endif err: if (IS_NOT_NULL(scan_env.error)) { if (IS_NOT_NULL(einfo)) { @@ -7513,6 +7635,12 @@ onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_fl else option |= syntax->options; + if ((option & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) { + case_fold_flag &= ~(INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR | + ONIGENC_CASE_FOLD_TURKISH_AZERI); + case_fold_flag |= ONIGENC_CASE_FOLD_ASCII_ONLY; + } + (reg)->enc = enc; (reg)->options = option; (reg)->syntax = syntax; @@ -7703,15 +7831,145 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) return onig_is_code_in_cc_len(len, code, cc); } + +#define MANY_REPEAT_OF_ANYCHAR 20 + +typedef enum { + MJ_NO = 0, + MJ_YES = 1, + MJ_IGNORE = 2, +} MJ_RESULT; + +static MJ_RESULT +mostly_just_anychar(Node* node, int in_reluctant) +{ + MJ_RESULT r; + + r = MJ_NO; + switch (NODE_TYPE(node)) { + case NODE_LIST: + { + int found = FALSE; + do { + r = mostly_just_anychar(NODE_CAR(node), in_reluctant); + if (r == MJ_NO) break; + if (r == MJ_YES) found = TRUE; + } while (IS_NOT_NULL(node = NODE_CDR(node))); + if (r == MJ_IGNORE) { + if (found == TRUE) r = MJ_YES; + } + } + break; + + case NODE_ALT: + r = MJ_IGNORE; + do { + r = mostly_just_anychar(NODE_CAR(node), in_reluctant); + if (r == MJ_YES) break; + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + + if (qn->upper == 0) + r = MJ_IGNORE; + else { + if (in_reluctant == FALSE) { + if (qn->greedy != 0 && + (! 
IS_INFINITE_REPEAT(qn->upper) && + qn->upper <= MANY_REPEAT_OF_ANYCHAR)) { + in_reluctant = TRUE; + } + } + r = mostly_just_anychar(NODE_BODY(node), in_reluctant); + } + } + break; + + case NODE_ANCHOR: + switch (ANCHOR_(node)->type) { + case ANCR_PREC_READ: + case ANCR_PREC_READ_NOT: + case ANCR_LOOK_BEHIND: + case ANCR_LOOK_BEHIND_NOT: + case ANCR_TEXT_SEGMENT_BOUNDARY: /* \y */ + r = MJ_IGNORE; + break; + default: + break; + } + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = mostly_just_anychar(en->te.Then, in_reluctant); + if (r == MJ_YES) break; + } + if (IS_NOT_NULL(en->te.Else)) { + r = mostly_just_anychar(en->te.Else, in_reluctant); + } + } + else { + r = mostly_just_anychar(NODE_BODY(node), in_reluctant); + } + } + break; + + case NODE_CTYPE: + if (CTYPE_(node)->ctype == CTYPE_ANYCHAR) + r = MJ_YES; + else + r = MJ_NO; + break; + + case NODE_STRING: + if (NODE_STRING_LEN(node) == 0) { + r = MJ_IGNORE; + break; + } + /* fall */ + case NODE_CCLASS: + r = MJ_NO; + break; + +#ifdef USE_CALL + case NODE_CALL: + /* ignore call */ +#endif + case NODE_BACKREF: + case NODE_GIMMICK: + r = MJ_IGNORE; + break; + + default: + break; + } + + return r; +} + +#define MAX_CALLS_IN_DETECT 10 + typedef struct { int prec_read; int look_behind; + int backref; int backref_with_level; int call; + int anychar_reluctant_many; + int empty_check_nest_level; + int max_empty_check_nest_level; + int heavy_element; } SlowElementCount; static int -node_detect_can_be_slow(Node* node, SlowElementCount* ct) +detect_can_be_slow(Node* node, SlowElementCount* ct, int ncall, int calls[]) { int r; @@ -7720,13 +7978,45 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct) case NODE_LIST: case NODE_ALT: do { - r = node_detect_can_be_slow(NODE_CAR(node), ct); + r = detect_can_be_slow(NODE_CAR(node), ct, ncall, calls); if (r != 0) return r; } while (IS_NOT_NULL(node = NODE_CDR(node))); break; case 
NODE_QUANT: - r = node_detect_can_be_slow(NODE_BODY(node), ct); + { + int prev_heavy_element; + QuantNode* qn; + Node* body; + + qn = QUANT_(node); + body = NODE_BODY(node); + + if (qn->emptiness != BODY_IS_NOT_EMPTY) { + prev_heavy_element = ct->heavy_element; + ct->empty_check_nest_level++; + if (ct->empty_check_nest_level > ct->max_empty_check_nest_level) + ct->max_empty_check_nest_level = ct->empty_check_nest_level; + } + else if (IS_INFINITE_REPEAT(qn->upper) || + qn->upper > MANY_REPEAT_OF_ANYCHAR) { + MJ_RESULT mr = mostly_just_anychar(body, (qn->greedy == 0)); + if (mr == MJ_YES) + ct->anychar_reluctant_many++; + } + + r = detect_can_be_slow(body, ct, ncall, calls); + + if (qn->emptiness != BODY_IS_NOT_EMPTY) { + if (NODE_IS_INPEEK(node)) { + if (ct->empty_check_nest_level > 2) { + if (prev_heavy_element == ct->heavy_element) + ct->heavy_element++; + } + } + ct->empty_check_nest_level--; + } + } break; case NODE_ANCHOR: @@ -7744,23 +8034,23 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct) } if (ANCHOR_HAS_BODY(ANCHOR_(node))) - r = node_detect_can_be_slow(NODE_BODY(node), ct); + r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls); break; case NODE_BAG: { BagNode* en = BAG_(node); - r = node_detect_can_be_slow(NODE_BODY(node), ct); + r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls); if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = node_detect_can_be_slow(en->te.Then, ct); + r = detect_can_be_slow(en->te.Then, ct, ncall, calls); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) { - r = node_detect_can_be_slow(en->te.Else, ct); + r = detect_can_be_slow(en->te.Else, ct, ncall, calls); if (r != 0) return r; } } @@ -7771,12 +8061,44 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct) case NODE_BACKREF: if (NODE_IS_NEST_LEVEL(node)) ct->backref_with_level++; + else + ct->backref++; break; #endif #ifdef USE_CALL case NODE_CALL: - ct->call++; + { + int i; + int found; + int gnum; + + 
gnum = CALL_(node)->called_gnum; + ct->call++; + + if (NODE_IS_RECURSION(node) && NODE_IS_INPEEK(node) && + NODE_IS_IN_REAL_REPEAT(node)) { + ct->heavy_element += 10; + } + + found = FALSE; + for (i = 0; i < ncall; i++) { + if (gnum == calls[i]) { + found = TRUE; + break; + } + } + + if (! found) { + if (ncall + 1 < MAX_CALLS_IN_DETECT) { + calls[ncall] = gnum; + r = detect_can_be_slow(NODE_BODY(node), ct, ncall + 1, calls); + } + else { + ct->heavy_element++; + } + } + } break; #endif @@ -7795,8 +8117,12 @@ onig_detect_can_be_slow_pattern(const UChar* pattern, int r; regex_t* reg; Node* root; - ScanEnv scan_env; + ParseEnv scan_env; SlowElementCount count; + int calls[MAX_CALLS_IN_DETECT]; +#ifdef USE_CALL + UnsetAddrList uslist = {0}; +#endif reg = (regex_t* )xmalloc(sizeof(regex_t)); if (IS_NULL(reg)) return ONIGERR_MEMORY; @@ -7807,25 +8133,44 @@ onig_detect_can_be_slow_pattern(const UChar* pattern, return r; } - root = 0; - r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); + r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, NULL +#ifdef USE_CALL + , &uslist +#endif + ); + if (r != 0) goto err; + +#ifdef USE_CALL + if (scan_env.num_call > 0) { + unset_addr_list_end(&uslist); + } +#endif + + count.prec_read = 0; + count.look_behind = 0; + count.backref = 0; + count.backref_with_level = 0; + count.call = 0; + count.anychar_reluctant_many = 0; + count.empty_check_nest_level = 0; + count.max_empty_check_nest_level = 0; + count.heavy_element = 0; + + r = detect_can_be_slow(root, &count, 0, calls); if (r == 0) { - count.prec_read = 0; - count.look_behind = 0; - count.backref_with_level = 0; - count.call = 0; - - r = node_detect_can_be_slow(root, &count); - if (r == 0) { - int n = count.prec_read + count.look_behind - + count.backref_with_level + count.call; - r = n; - } + int n = count.prec_read + count.look_behind + + count.backref + count.backref_with_level + count.call + + count.anychar_reluctant_many; + if (count.heavy_element != 0) 
+ n += count.heavy_element * 10; + + r = n; } if (IS_NOT_NULL(scan_env.mem_env_dynamic)) xfree(scan_env.mem_env_dynamic); + err: onig_node_free(root); onig_free(reg); return r; @@ -7853,6 +8198,8 @@ Indent(FILE* f, int indent) static void print_indent_tree(FILE* f, Node* node, int indent) { + static char* emptiness_name[] = { "", " empty", " empty_mem", " empty_rec" }; + int i; NodeType type; UChar* p; @@ -8019,69 +8366,83 @@ print_indent_tree(FILE* f, Node* node, int indent) fprintf(f, "<call:%p>", node); fprintf(f, " num: %d, name", cn->called_gnum); p_string(f, cn->name_end - cn->name, cn->name); + if (NODE_IS_RECURSION(node)) fprintf(f, ", recursion"); + if (NODE_IS_INPEEK(node)) fprintf(f, ", in-peek"); + if (NODE_IS_IN_REAL_REPEAT(node)) fprintf(f, ", in-real-repeat"); } break; #endif case NODE_QUANT: - fprintf(f, "<quantifier:%p>{%d,%d}%s%s\n", node, - QUANT_(node)->lower, QUANT_(node)->upper, - (QUANT_(node)->greedy ? "" : "?"), - QUANT_(node)->include_referred == 0 ? "" : " referred"); - print_indent_tree(f, NODE_BODY(node), indent + add); + { + fprintf(f, "<quantifier:%p>{%d,%d}%s%s%s", node, + QUANT_(node)->lower, QUANT_(node)->upper, + (QUANT_(node)->greedy ? "" : "?"), + QUANT_(node)->include_referred == 0 ? 
"" : " referred", + emptiness_name[QUANT_(node)->emptiness]); + if (NODE_IS_INPEEK(node)) fprintf(f, ", in-peek"); + fprintf(f, "\n"); + print_indent_tree(f, NODE_BODY(node), indent + add); + } break; case NODE_BAG: - fprintf(f, "<bag:%p> ", node); - if (BAG_(node)->type == BAG_IF_ELSE) { - Node* Then; - Node* Else; - BagNode* bn; - - bn = BAG_(node); - fprintf(f, "if-else\n"); - print_indent_tree(f, NODE_BODY(node), indent + add); + { + BagNode* bn = BAG_(node); + fprintf(f, "<bag:%p> ", node); + if (bn->type == BAG_IF_ELSE) { + Node* Then; + Node* Else; + + fprintf(f, "if-else\n"); + print_indent_tree(f, NODE_BODY(node), indent + add); + + Then = bn->te.Then; + Else = bn->te.Else; + if (IS_NULL(Then)) { + Indent(f, indent + add); + fprintf(f, "THEN empty\n"); + } + else + print_indent_tree(f, Then, indent + add); - Then = bn->te.Then; - Else = bn->te.Else; - if (IS_NULL(Then)) { - Indent(f, indent + add); - fprintf(f, "THEN empty\n"); + if (IS_NULL(Else)) { + Indent(f, indent + add); + fprintf(f, "ELSE empty\n"); + } + else + print_indent_tree(f, Else, indent + add); } - else - print_indent_tree(f, Then, indent + add); + else { + switch (bn->type) { + case BAG_OPTION: + fprintf(f, "option:%d", bn->o.options); + break; + case BAG_MEMORY: + fprintf(f, "memory:%d", bn->m.regnum); + if (NODE_IS_CALLED(node)) { + fprintf(f, ", called"); + if (NODE_IS_RECURSION(node)) + fprintf(f, ", recursion"); + } + else if (NODE_IS_REFERENCED(node)) + fprintf(f, ", referenced"); - if (IS_NULL(Else)) { - Indent(f, indent + add); - fprintf(f, "ELSE empty\n"); + if (NODE_IS_FIXED_ADDR(node)) + fprintf(f, ", fixed-addr"); + if ((bn->m.called_state & IN_PEEK) != 0) + fprintf(f, ", in-peek"); + break; + case BAG_STOP_BACKTRACK: + fprintf(f, "stop-bt"); + break; + default: + break; + } + fprintf(f, "\n"); + print_indent_tree(f, NODE_BODY(node), indent + add); } - else - print_indent_tree(f, Else, indent + add); - - break; } - - switch (BAG_(node)->type) { - case BAG_OPTION: - fprintf(f, 
"option:%d", BAG_(node)->o.options); - break; - case BAG_MEMORY: - fprintf(f, "memory:%d", BAG_(node)->m.regnum); - if (NODE_IS_CALLED(node)) - fprintf(f, ", called"); - else if (NODE_IS_REFERENCED(node)) - fprintf(f, ", referenced"); - if (NODE_IS_FIXED_ADDR(node)) - fprintf(f, ", fixed-addr"); - break; - case BAG_STOP_BACKTRACK: - fprintf(f, "stop-bt"); - break; - default: - break; - } - fprintf(f, "\n"); - print_indent_tree(f, NODE_BODY(node), indent + add); break; case NODE_GIMMICK: diff --git a/src/regenc.c b/src/regenc.c index 27e4549..84afd1e 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -2,7 +2,7 @@ regenc.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -569,6 +569,9 @@ onigenc_apply_all_case_fold_with_map(int map_size, r = onigenc_ascii_apply_all_case_fold(flag, f, arg); if (r != 0) return r; + if (CASE_FOLD_IS_ASCII_ONLY(flag)) + return 0; + for (i = 0; i < map_size; i++) { code = map[i].to; r = (*f)(map[i].from, &code, 1, arg); @@ -588,7 +591,7 @@ onigenc_apply_all_case_fold_with_map(int map_size, extern int onigenc_get_case_fold_codes_by_str_with_map(int map_size, const OnigPairCaseFoldCodes map[], - int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED, + int ess_tsett_flag, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) { int i, j, n; @@ -596,7 +599,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, if (0x41 <= *p && *p <= 0x5a) { /* A - Z */ if (*p == LARGE_S && ess_tsett_flag != 0 && end > p + 1 - && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */ + && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) /* SS */ + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { ss_combination: items[0].byte_len = 2; items[0].code_len = 1; @@ -625,7 +629,8 @@ 
onigenc_get_case_fold_codes_by_str_with_map(int map_size, } else if (0x61 <= *p && *p <= 0x7a) { /* a - z */ if (*p == SMALL_S && ess_tsett_flag != 0 && end > p + 1 - && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { + && (*(p+1) == SMALL_S || *(p+1) == LARGE_S) + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { goto ss_combination; } @@ -634,7 +639,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, items[0].code[0] = (OnigCodePoint )(*p - 0x20); return 1; } - else if (*p == 0xdf && ess_tsett_flag != 0) { + else if (*p == 0xdf && ess_tsett_flag != 0 + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { items[0].byte_len = 1; items[0].code_len = 2; items[0].code[0] = (OnigCodePoint )'s'; @@ -660,6 +666,9 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, else { int i; + if (CASE_FOLD_IS_ASCII_ONLY(flag)) + return 0; + for (i = 0; i < map_size; i++) { if (*p == map[i].from) { items[0].byte_len = 1; diff --git a/src/regenc.h b/src/regenc.h index d183b97..d0b447d 100644 --- a/src/regenc.h +++ b/src/regenc.h @@ -142,6 +142,10 @@ struct PropertyNameCtype { #define ENC_GET_SKIP_OFFSET(enc) \ (((enc)->flag & ENC_FLAG_SKIP_OFFSET_MASK)>>2) +#define CASE_FOLD_IS_ASCII_ONLY(flag) \ + (((flag) & ONIGENC_CASE_FOLD_ASCII_ONLY) != 0) +#define CASE_FOLD_IS_NOT_ASCII_ONLY(flag) \ + (((flag) & ONIGENC_CASE_FOLD_ASCII_ONLY) == 0) /* for encoding system implementation (internal) */ extern int onigenc_end(void); @@ -202,12 +206,12 @@ extern int onigenc_wb_is_break_position P_((OnigEncoding enc, UChar* p, UChar* p #define FOLDS1_UNFOLDS_NUM(i) (OnigUnicodeFolds1[(i)+1]) #define FOLDS2_UNFOLDS_NUM(i) (OnigUnicodeFolds2[(i)+2]) #define FOLDS3_UNFOLDS_NUM(i) (OnigUnicodeFolds3[(i)+3]) -#define FOLDS1_UNFOLDS(i) (OnigUnicodeFolds1 + (i) + 2) -#define FOLDS2_UNFOLDS(i) (OnigUnicodeFolds2 + (i) + 3) -#define FOLDS3_UNFOLDS(i) (OnigUnicodeFolds3 + (i) + 4) -#define FOLDS1_NEXT_INDEX(i) ((i) + 2 + OnigUnicodeFolds1[(i)+1]) -#define FOLDS2_NEXT_INDEX(i) ((i) + 3 + OnigUnicodeFolds2[(i)+2]) 
-#define FOLDS3_NEXT_INDEX(i) ((i) + 4 + OnigUnicodeFolds3[(i)+3]) +#define FOLDS1_UNFOLDS(i) (FOLDS1_FOLD(i) + 2) +#define FOLDS2_UNFOLDS(i) (FOLDS2_FOLD(i) + 3) +#define FOLDS3_UNFOLDS(i) (FOLDS3_FOLD(i) + 4) +#define FOLDS1_NEXT_INDEX(i) ((i) + 2 + FOLDS1_UNFOLDS_NUM(i)) +#define FOLDS2_NEXT_INDEX(i) ((i) + 3 + FOLDS2_UNFOLDS_NUM(i)) +#define FOLDS3_NEXT_INDEX(i) ((i) + 4 + FOLDS3_UNFOLDS_NUM(i)) #define FOLDS_FOLD_ADDR_BUK(buk, addr) do {\ if ((buk)->fold_len == 1)\ diff --git a/src/regerror.c b/src/regerror.c index dc1c8b6..18a5bdd 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -2,7 +2,7 @@ regerror.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -146,6 +146,8 @@ onig_error_code_to_format(int code) p = "too big wide-char value"; break; case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE: p = "too long wide-char value"; break; + case ONIGERR_UNDEFINED_OPERATOR: + p = "undefined operator"; break; case ONIGERR_INVALID_CODE_POINT_VALUE: p = "invalid code point value"; break; case ONIGERR_EMPTY_GROUP_NAME: @@ -190,6 +192,8 @@ onig_error_code_to_format(int code) p = "not supported encoding combination"; break; case ONIGERR_INVALID_COMBINATION_OF_OPTIONS: p = "invalid combination of options"; break; + case ONIGERR_VERY_INEFFICIENT_PATTERN: + p = "very inefficient pattern"; break; case ONIGERR_LIBRARY_IS_NOT_INITIALIZED: p = "library is not initialized"; break; diff --git a/src/regexec.c b/src/regexec.c index bb6b474..a3cf60a 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -2,7 +2,7 @@ regexec.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -54,6 +54,13 @@ (MEM_STATUS_AT((reg)->push_mem_end, (idx)) != 0 ? \ STACK_AT(mem_end_stk[idx].i)->u.mem.pstr : mem_end_stk[idx].s) +#ifdef _MSC_VER +#define DIST_CAST(d) (size_t )(d) +#else +#define DIST_CAST(d) (d) +#endif + + static int forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, UChar* range, UChar** low, UChar** high); static int @@ -76,11 +83,12 @@ struct OnigMatchParamStruct { unsigned long retry_limit_in_match; unsigned long retry_limit_in_search; #endif + + void* callout_user_data; /* used in callback each match */ #ifdef USE_CALLOUT OnigCalloutFunc progress_callout_of_contents; OnigCalloutFunc retraction_callout_of_contents; int match_at_call_counter; - void* callout_user_data; CalloutData* callout_data; int callout_data_alloc_num; #endif @@ -143,12 +151,8 @@ onig_set_retraction_callout_of_match_param(OnigMatchParam* param, OnigCalloutFun extern int onig_set_callout_user_data_of_match_param(OnigMatchParam* param, void* user_data) { -#ifdef USE_CALLOUT param->callout_user_data = user_data; return ONIG_NORMAL; -#else - return ONIG_NO_SUPPORT_CONFIG; -#endif } @@ -873,6 +877,23 @@ onig_get_capture_tree(OnigRegion* region) } #endif /* USE_CAPTURE_HISTORY */ + +static OnigCallbackEachMatchFunc CallbackEachMatch; + +extern OnigCallbackEachMatchFunc +onig_get_callback_each_match(void) +{ + return CallbackEachMatch; +} + +extern int +onig_set_callback_each_match(OnigCallbackEachMatchFunc f) +{ + CallbackEachMatch = f; + return ONIG_NORMAL; +} + + extern void onig_region_clear(OnigRegion* region) { @@ -1238,7 +1259,7 @@ struct OnigCalloutArgsStruct { #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE #define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ (msa).stack_p = (void* )0;\ - (msa).options = (arg_option);\ + (msa).options = (arg_option)|(reg)->options;\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ 
(msa).match_stack_limit = (mpv)->match_stack_limit;\ @@ -1251,7 +1272,7 @@ struct OnigCalloutArgsStruct { #else #define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ (msa).stack_p = (void* )0;\ - (msa).options = (arg_option);\ + (msa).options = (arg_option)|(reg)->options;\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ (msa).match_stack_limit = (mpv)->match_stack_limit;\ @@ -1405,6 +1426,7 @@ onig_set_subexp_call_limit_in_search(unsigned long n) #endif + #ifdef USE_CALLOUT static OnigCalloutFunc DefaultProgressCallout; static OnigCalloutFunc DefaultRetractionCallout; @@ -1452,11 +1474,12 @@ onig_initialize_match_param(OnigMatchParam* mp) mp->retry_limit_in_search = RetryLimitInSearch; #endif + mp->callout_user_data = 0; + #ifdef USE_CALLOUT mp->progress_callout_of_contents = DefaultProgressCallout; mp->retraction_callout_of_contents = DefaultRetractionCallout; mp->match_at_call_counter = 0; - mp->callout_user_data = 0; mp->callout_data = 0; mp->callout_data_alloc_num = 0; #endif @@ -1532,13 +1555,26 @@ onig_get_callout_data_dont_clear_old(regex_t* reg, OnigMatchParam* mp, t = d->slot[slot].type; if (IS_NOT_NULL(type)) *type = t; if (IS_NOT_NULL(val)) *val = d->slot[slot].val; - return (t == ONIG_TYPE_VOID ? 1 : ONIG_NORMAL); + return (t == ONIG_TYPE_VOID ? 
ONIG_VALUE_IS_NOT_SET : ONIG_NORMAL); +} + +extern int +onig_get_callout_data_by_tag_dont_clear_old(regex_t* reg, + OnigMatchParam* mp, const UChar* tag, const UChar* tag_end, int slot, + OnigType* type, OnigValue* val) +{ + int num; + + num = onig_get_callout_num_by_tag(reg, tag, tag_end); + if (num < 0) return num; + if (num == 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME; + + return onig_get_callout_data_dont_clear_old(reg, mp, num, slot, type, val); } extern int -onig_get_callout_data_by_callout_args_self_dont_clear_old(OnigCalloutArgs* args, - int slot, OnigType* type, - OnigValue* val) +onig_get_callout_data_by_callout_args_self_dont_clear_old( + OnigCalloutArgs* args, int slot, OnigType* type, OnigValue* val) { return onig_get_callout_data_dont_clear_old(args->regex, args->msa->mp, args->num, slot, type, val); @@ -1563,7 +1599,7 @@ onig_get_callout_data(regex_t* reg, OnigMatchParam* mp, t = d->slot[slot].type; if (IS_NOT_NULL(type)) *type = t; if (IS_NOT_NULL(val)) *val = d->slot[slot].val; - return (t == ONIG_TYPE_VOID ? 1 : ONIG_NORMAL); + return (t == ONIG_TYPE_VOID ? 
ONIG_VALUE_IS_NOT_SET : ONIG_NORMAL); } extern int @@ -2171,65 +2207,90 @@ stack_double(int* is_alloca, char** arg_alloc_base, }\ } while (0) -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT -#define STACK_EMPTY_CHECK_MEM(isnull, sid, s, reg) do {\ - StackType* k;\ - GET_EMPTY_CHECK_START(sid, k);\ - if (k->u.empty_check.pstr != (s)) {\ +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT +#define STACK_EMPTY_CHECK_MEM(isnull, sid, empty_status_mem, s, reg) do {\ + StackType* klow;\ + GET_EMPTY_CHECK_START(sid, klow);\ + if (klow->u.empty_check.pstr != (s)) {\ + stack_empty_check_mem_not_empty:\ (isnull) = 0;\ }\ else {\ - UChar* endp;\ + StackType *k, *kk;\ + MemStatusType ms = (empty_status_mem);\ (isnull) = 1;\ - while (k < stk) {\ - if (k->type == STK_MEM_START &&\ - MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid)) {\ - STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ - if (endp == 0) {\ - (isnull) = 0; break;\ - }\ - else if (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != endp) {\ - (isnull) = 0; break;\ - }\ - else if (endp != s) {\ - (isnull) = -1; /* empty, but position changed */ \ + k = stk;\ + while (k > klow) {\ + k--;\ + if (k->type == STK_MEM_END && MEM_STATUS_LIMIT_AT(ms, k->zid)) {\ + kk = klow;\ + while (kk < k) {\ + if (kk->type == STK_MEM_START && kk->zid == k->zid) {\ + if (kk->u.mem.prev_end.i == INVALID_STACK_INDEX || \ + ((STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr) && (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr))) {\ + goto stack_empty_check_mem_not_empty;\ + }\ + else {\ + ms &= ~((MemStatusType )1 << k->zid);\ + break;\ + }\ + }\ + kk++;\ }\ + if (ms == 0) break;\ }\ - k++;\ }\ }\ } while(0) -#define STACK_EMPTY_CHECK_MEM_REC(isnull,sid,s,reg) do {\ +#define 
STACK_EMPTY_CHECK_MEM_REC(isnull,sid,empty_status_mem,s,reg) do {\ int level = 0;\ - StackType* k = stk;\ + StackType* klow = stk;\ while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_EMPTY_CHECK_MEM_REC");\ - if (k->type == STK_EMPTY_CHECK_START) {\ - if (k->zid == (sid)) {\ + klow--;\ + STACK_BASE_CHECK(klow, "STACK_EMPTY_CHECK_MEM_REC");\ + if (klow->type == STK_EMPTY_CHECK_START) {\ + if (klow->zid == (sid)) {\ if (level == 0) {\ - if (k->u.empty_check.pstr != (s)) {\ + if (klow->u.empty_check.pstr != (s)) {\ + stack_empty_check_mem_rec_not_empty:\ (isnull) = 0;\ break;\ }\ else {\ - UChar* endp;\ + StackType *k, *kk;\ + MemStatusType ms;\ (isnull) = 1;\ - while (k < stk) {\ - if (k->type == STK_MEM_START) {\ - if (level == 0 && \ - MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid) !=0) {\ - STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ - if (endp == 0) {\ - (isnull) = 0; break;\ - }\ - else if (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != endp) { \ - (isnull) = 0; break;\ - }\ - else if (endp != s) {\ - (isnull) = -1; /* empty, but position changed */\ + if ((empty_status_mem) == 0) break;\ + ms = (empty_status_mem);\ + k = stk;\ + while (k > klow) {\ + k--;\ + if (k->type == STK_MEM_END) {\ + if (level == 0 && MEM_STATUS_LIMIT_AT(ms, k->zid)) {\ + kk = klow;\ + kk++;\ + while (kk < k) {\ + if (kk->type == STK_MEM_START && kk->zid == k->zid) {\ + if (kk->u.mem.prev_end.i == INVALID_STACK_INDEX || \ + ((STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr) && (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr))) {\ + goto stack_empty_check_mem_rec_not_empty;\ + }\ + else {\ + ms &= ~((MemStatusType )1 << k->zid);\ + break;\ + }\ + }\ + else if (kk->type == STK_EMPTY_CHECK_START) {\ + if (kk->zid == (sid)) level++;\ + }\ + else if (kk->type == 
STK_EMPTY_CHECK_END) {\ + if (kk->zid == (sid)) level--;\ + }\ + kk++;\ }\ + level = 0;\ + if (ms == 0) break;\ }\ }\ else if (k->type == STK_EMPTY_CHECK_START) {\ @@ -2238,7 +2299,6 @@ stack_double(int* is_alloca, char** arg_alloc_base, else if (k->type == STK_EMPTY_CHECK_END) {\ if (k->zid == (sid)) level--;\ }\ - k++;\ }\ break;\ }\ @@ -2248,8 +2308,8 @@ stack_double(int* is_alloca, char** arg_alloc_base, }\ }\ }\ - else if (k->type == STK_EMPTY_CHECK_END) {\ - if (k->zid == (sid)) level++;\ + else if (klow->type == STK_EMPTY_CHECK_END) {\ + if (klow->zid == (sid)) level++;\ }\ }\ } while(0) @@ -2274,7 +2334,7 @@ stack_double(int* is_alloca, char** arg_alloc_base, }\ }\ } while(0) -#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #define STACK_GET_REPEAT_COUNT_SEARCH(sid, c) do {\ StackType* k = stk;\ @@ -2888,6 +2948,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, StackType *stkp; /* used as any purpose. */ StkPtrType *mem_start_stk, *mem_end_stk; UChar* keep; + OnigRegion* region; #ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR StackIndex *repeat_stk; @@ -2905,8 +2966,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, unsigned long subexp_call_counters[MAX_SUBEXP_CALL_COUNTERS]; #endif + OnigOptionType options; Operation* p = reg->ops; - OnigOptionType option = reg->options; OnigEncoding encode = reg->enc; OnigCaseFoldType case_fold_flag = reg->case_fold_flag; @@ -2936,6 +2997,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } #endif + options = msa->options; + #ifdef USE_CALLOUT msa->mp->match_at_call_counter++; #endif @@ -2976,102 +3039,113 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, BYTECODE_INTERPRETER_START { CASE_OP(END) n = (int )(s - sstart); + if (n == 0 && OPTON_FIND_NOT_EMPTY(options)) { + best_len = ONIG_MISMATCH; + goto fail; /* for retry */ + } + if (n > best_len) { - OnigRegion* region; #ifdef 
USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE - if (OPTON_FIND_LONGEST(option)) { + if (OPTON_FIND_LONGEST(options)) { if (n > msa->best_len) { msa->best_len = n; msa->best_s = (UChar* )sstart; - goto set_region; } - else - goto end_best_len; + else { + if (s >= in_right_range && msa->best_s == sstart) { + best_len = msa->best_len; /* end of find */ + } + else { + SOP_OUT; + goto fail; /* for retry */ + } + } } -#endif + else { + best_len = n; + } +#else best_len = n; +#endif + } - set_region: - region = msa->region; - if (region) { - if (keep > s) keep = s; + /* set region */ + region = msa->region; + if (region) { + if (keep > s) keep = s; #ifdef USE_POSIX_API - if (OPTON_POSIX_REGION(msa->options)) { - posix_regmatch_t* rmt = (posix_regmatch_t* )region; - - rmt[0].rm_so = (regoff_t )(keep - str); - rmt[0].rm_eo = (regoff_t )(s - str); - for (i = 1; i <= num_mem; i++) { - if (mem_end_stk[i].i != INVALID_STACK_INDEX) { - rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str); - rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i) - str); - } - else { - rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; - } + if (OPTON_POSIX_REGION(options)) { + posix_regmatch_t* rmt = (posix_regmatch_t* )region; + + rmt[0].rm_so = (regoff_t )(keep - str); + rmt[0].rm_eo = (regoff_t )(s - str); + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i].i != INVALID_STACK_INDEX) { + rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str); + rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i) - str); + } + else { + rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; } } - else { + } + else { #endif /* USE_POSIX_API */ - region->beg[0] = (int )(keep - str); - region->end[0] = (int )(s - str); - for (i = 1; i <= num_mem; i++) { - if (mem_end_stk[i].i != INVALID_STACK_INDEX) { - region->beg[i] = (int )(STACK_MEM_START(reg, i) - str); - region->end[i] = (int )(STACK_MEM_END(reg, i) - str); - } - else { - region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; - } + region->beg[0] = (int )(keep - str); + 
region->end[0] = (int )(s - str); + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i].i != INVALID_STACK_INDEX) { + region->beg[i] = (int )(STACK_MEM_START(reg, i) - str); + region->end[i] = (int )(STACK_MEM_END(reg, i) - str); + } + else { + region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; } + } #ifdef USE_CAPTURE_HISTORY - if (reg->capture_history != 0) { - int r; - OnigCaptureTreeNode* node; + if (reg->capture_history != 0) { + OnigCaptureTreeNode* node; - if (IS_NULL(region->history_root)) { - region->history_root = node = history_node_new(); - CHECK_NULL_RETURN_MEMERR(node); - } - else { - node = region->history_root; - history_tree_clear(node); - } + if (IS_NULL(region->history_root)) { + region->history_root = node = history_node_new(); + CHECK_NULL_RETURN_MEMERR(node); + } + else { + node = region->history_root; + history_tree_clear(node); + } - node->group = 0; - node->beg = (int )(keep - str); - node->end = (int )(s - str); + node->group = 0; + node->beg = (int )(keep - str); + node->end = (int )(s - str); - stkp = stk_base; - r = make_capture_history_tree(region->history_root, &stkp, - stk, (UChar* )str, reg); - if (r < 0) MATCH_AT_ERROR_RETURN(r); - } + stkp = stk_base; + i = make_capture_history_tree(region->history_root, &stkp, + stk, (UChar* )str, reg); + if (i < 0) MATCH_AT_ERROR_RETURN(i); + } #endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_API - } /* else OPTON_POSIX_REGION() */ + } /* else OPTON_POSIX_REGION() */ #endif - } /* if (region) */ - } /* n > best_len */ + } /* if (region) */ -#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE - end_best_len: -#endif SOP_OUT; - if (OPTON_FIND_CONDITION(option)) { - if (OPTON_FIND_NOT_EMPTY(option) && s == sstart) { + if (OPTON_CALLBACK_EACH_MATCH(options) && + IS_NOT_NULL(CallbackEachMatch)) { + i = CallbackEachMatch(str, end, sstart, region, + msa->mp->callout_user_data); + if (i < 0) MATCH_AT_ERROR_RETURN(i); + +#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE + if (! 
OPTON_FIND_LONGEST(options)) +#endif best_len = ONIG_MISMATCH; - goto fail; /* for retry */ - } - if (OPTON_FIND_LONGEST(option)) { - if (s >= in_right_range && msa->best_s == sstart) - best_len = msa->best_len; - else - goto fail; /* for retry */ - } + + goto fail; } /* default behavior: return first-matching result. */ @@ -3564,23 +3638,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(BEGIN_BUF) if (! ON_STR_BEGIN(s)) goto fail; - if (OPTON_NOTBOL(msa->options)) goto fail; - if (OPTON_NOT_BEGIN_STRING(msa->options)) goto fail; + if (OPTON_NOTBOL(options)) goto fail; + if (OPTON_NOT_BEGIN_STRING(options)) goto fail; INC_OP; JUMP_OUT; CASE_OP(END_BUF) if (! ON_STR_END(s)) goto fail; - if (OPTON_NOTEOL(msa->options)) goto fail; - if (OPTON_NOT_END_STRING(msa->options)) goto fail; + if (OPTON_NOTEOL(options)) goto fail; + if (OPTON_NOT_END_STRING(options)) goto fail; INC_OP; JUMP_OUT; CASE_OP(BEGIN_LINE) if (ON_STR_BEGIN(s)) { - if (OPTON_NOTBOL(msa->options)) goto fail; + if (OPTON_NOTBOL(options)) goto fail; INC_OP; JUMP_OUT; } @@ -3599,7 +3673,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif - if (OPTON_NOTEOL(msa->options)) goto fail; + if (OPTON_NOTEOL(options)) goto fail; INC_OP; JUMP_OUT; #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE @@ -3624,8 +3698,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif - if (OPTON_NOTEOL(msa->options)) goto fail; - if (OPTON_NOT_END_STRING(msa->options)) goto fail; + if (OPTON_NOTEOL(options)) goto fail; + if (OPTON_NOT_END_STRING(options)) goto fail; INC_OP; JUMP_OUT; #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE @@ -3634,8 +3708,8 @@ match_at(regex_t* reg, const UChar* str, const 
UChar* end, } else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) && ON_STR_END(s + enclen(encode, s))) { - if (OPTON_NOTEOL(msa->options)) goto fail; - if (OPTON_NOT_END_STRING(msa->options)) goto fail; + if (OPTON_NOTEOL(options)) goto fail; + if (OPTON_NOT_END_STRING(options)) goto fail; INC_OP; JUMP_OUT; } @@ -3644,8 +3718,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar* ss = s + enclen(encode, s); ss += enclen(encode, ss); if (ON_STR_END(ss)) { - if (OPTON_NOTEOL(msa->options)) goto fail; - if (OPTON_NOT_END_STRING(msa->options)) goto fail; + if (OPTON_NOTEOL(options)) goto fail; + if (OPTON_NOT_END_STRING(options)) goto fail; INC_OP; JUMP_OUT; } @@ -3657,7 +3731,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, switch (p->check_position.type) { case CHECK_POSITION_SEARCH_START: if (s != msa->start) goto fail; - if (OPTON_NOT_BEGIN_POSITION(msa->options)) goto fail; + if (OPTON_NOT_BEGIN_POSITION(options)) goto fail; break; case CHECK_POSITION_CURRENT_RIGHT_RANGE: if (s != right_range) goto fail; @@ -3924,13 +3998,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } JUMP_OUT; -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT CASE_OP(EMPTY_CHECK_END_MEMST) { int is_empty; mem = p->empty_check_end.mem; /* mem: null check id */ - STACK_EMPTY_CHECK_MEM(is_empty, mem, s, reg); + STACK_EMPTY_CHECK_MEM(is_empty, mem, p->empty_check_end.empty_status_mem, s, reg); INC_OP; if (is_empty) { #ifdef ONIG_DEBUG_MATCH @@ -3949,8 +4023,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int is_empty; mem = p->empty_check_end.mem; /* mem: null check id */ -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT - STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg); +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT + STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, p->empty_check_end.empty_status_mem, s, reg); #else STACK_EMPTY_CHECK_REC(is_empty, mem, s); #endif @@ -4109,6 +4183,9 @@ 
match_at(regex_t* reg, const UChar* str, const UChar* end, } } +#ifdef ONIG_DEBUG_CALL + fprintf(DBGFP, "CALL: id:%d, at:%ld, level:%lu\n", p->call.called_mem, s - str, subexp_call_nest_counter); +#endif addr = p->call.addr; INC_OP; STACK_PUSH_CALL_FRAME(p); p = reg->ops + addr; @@ -4425,7 +4502,7 @@ regset_search_body_position_lead(OnigRegSet* set, sr[i].state = SRS_DEAD; if (reg->optimize != OPTIMIZE_NONE) { if (reg->dist_max != INFINITE_LEN) { - if (end - range > reg->dist_max) + if (DIST_CAST(end - range) > reg->dist_max) sch_range = (UChar* )range + reg->dist_max; else sch_range = (UChar* )end; @@ -4609,7 +4686,7 @@ onig_regset_search_with_param(OnigRegSet* set, if (set->n == 0) return ONIG_MISMATCH; - if (OPTON_POSIX_REGION(option)) + if (OPTON_POSIX_REGION(option) || OPTON_CALLBACK_EACH_MATCH(option)) return ONIGERR_INVALID_ARGUMENT; r = 0; @@ -4884,7 +4961,7 @@ sunday_quick_search_step_forward(regex_t* reg, const UChar* text_range) { const UChar *s, *se, *t, *p, *end; - const UChar *tail; + const UChar *tail, *next; int skip, tlen1; int map_offset; OnigEncoding enc; @@ -4921,9 +4998,11 @@ sunday_quick_search_step_forward(regex_t* reg, s += enclen(enc, s); } while ((s - t) < skip && s < end); #else - s += skip; - if (s < end) - s = onigenc_get_right_adjust_char_head(enc, text, s); + next = s + skip; + if (next < end) + s = onigenc_get_right_adjust_char_head(enc, s, next); + else + break; #endif } @@ -5086,7 +5165,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, p = start; if (reg->dist_min != 0) { - if (end - p <= reg->dist_min) + if (DIST_CAST(end - p) <= reg->dist_min) return 0; /* fail */ if (ONIGENC_IS_SINGLEBYTE(reg->enc)) { @@ -5119,7 +5198,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, } if (p && p < range) { - if (p - start < reg->dist_min) { + if (DIST_CAST(p - start) < reg->dist_min) { retry_gate: pprev = p; p += enclen(reg->enc, p); @@ -5164,7 +5243,7 @@ 
forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, } else { if (reg->dist_max != INFINITE_LEN) { - if (p - str < reg->dist_max) { + if (DIST_CAST(p - str) < reg->dist_max) { *low = (UChar* )str; } else { @@ -5175,7 +5254,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, } } /* no needs to adjust *high, *high is used as range check only */ - if (p - str < reg->dist_min) + if (DIST_CAST(p - str) < reg->dist_min) *high = (UChar* )str; else *high = p - reg->dist_min; @@ -5260,13 +5339,13 @@ backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s, } if (reg->dist_max != INFINITE_LEN) { - if (p - str < reg->dist_max) + if (DIST_CAST(p - str) < reg->dist_max) *low = (UChar* )str; else *low = p - reg->dist_max; if (reg->dist_min != 0) { - if (p - str < reg->dist_min) + if (DIST_CAST(p - str) < reg->dist_min) *high = (UChar* )str; else *high = p - reg->dist_min; @@ -5410,13 +5489,13 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, if (range > start) { if (reg->anc_dist_max != INFINITE_LEN && - min_semi_end - start > reg->anc_dist_max) { + DIST_CAST(min_semi_end - start) > reg->anc_dist_max) { start = min_semi_end - reg->anc_dist_max; if (start < end) start = onigenc_get_right_adjust_char_head(reg->enc, str, start); } - if (max_semi_end - (range - 1) < reg->anc_dist_min) { - if (max_semi_end - str + 1 < reg->anc_dist_min) + if (DIST_CAST(max_semi_end - (range - 1)) < reg->anc_dist_min) { + if (DIST_CAST(max_semi_end - str + 1) < reg->anc_dist_min) goto mismatch_no_msa; else range = max_semi_end - reg->anc_dist_min + 1; @@ -5428,11 +5507,11 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, } else { if (reg->anc_dist_max != INFINITE_LEN && - min_semi_end - range > reg->anc_dist_max) { + DIST_CAST(min_semi_end - range) > reg->anc_dist_max) { range = min_semi_end - reg->anc_dist_max; } - if (max_semi_end - start < reg->anc_dist_min) { - if (max_semi_end - str < 
reg->anc_dist_min) + if (DIST_CAST(max_semi_end - start) < reg->anc_dist_min) { + if (DIST_CAST(max_semi_end - str) < reg->anc_dist_min) goto mismatch_no_msa; else { start = max_semi_end - reg->anc_dist_min; @@ -5503,7 +5582,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, if (reg->dist_max == INFINITE_LEN) sch_range = (UChar* )end; else { - if ((end - range) < reg->dist_max) + if (DIST_CAST(end - range) < reg->dist_max) sch_range = (UChar* )end; else { sch_range = (UChar* )range + reg->dist_max; @@ -5579,14 +5658,14 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, else adjrange = (UChar* )end; - if (end - range > reg->dist_min) + if (DIST_CAST(end - range) > reg->dist_min) min_range = range + reg->dist_min; else min_range = end; if (reg->dist_max != INFINITE_LEN) { do { - if (end - s > reg->dist_max) + if (DIST_CAST(end - s) > reg->dist_max) sch_start = s + reg->dist_max; else { sch_start = onigenc_get_prev_char_head(reg->enc, str, end); @@ -5887,8 +5966,10 @@ onig_regset_add(OnigRegSet* set, regex_t* reg) { OnigRegion* region; +#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE if (OPTON_FIND_LONGEST(reg->options)) return ONIGERR_INVALID_ARGUMENT; +#endif if (set->n != 0 && reg->enc != set->enc) return ONIGERR_INVALID_ARGUMENT; @@ -5933,8 +6014,10 @@ onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) set->n--; } else { +#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE if (OPTON_FIND_LONGEST(reg->options)) return ONIGERR_INVALID_ARGUMENT; +#endif if (set->n > 1 && reg->enc != set->enc) return ONIGERR_INVALID_ARGUMENT; @@ -6573,7 +6656,7 @@ onig_builtin_monitor(OnigCalloutArgs* args, void* user_data) tag_len = tag_end - tag_start; if (tag_len >= sizeof(buf)) tag_len = sizeof(buf) - 1; - for (i = 0; i < tag_len; i++) buf[i] = tag_start[i]; + for (i = 0; i < (int )tag_len; i++) buf[i] = tag_start[i]; buf[tag_len] = '\0'; } diff --git a/src/regint.h b/src/regint.h index 74a5c61..9856a96 100644 --- a/src/regint.h +++ b/src/regint.h 
@@ -4,7 +4,7 @@ regint.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,6 +35,7 @@ /* #define ONIG_DEBUG_SEARCH */ /* #define ONIG_DEBUG_MATCH */ /* #define ONIG_DEBUG_MATCH_COUNTER */ +/* #define ONIG_DEBUG_CALL */ /* #define ONIG_DONT_OPTIMIZE */ /* for byte-code statistical data. */ @@ -42,7 +43,8 @@ #if defined(ONIG_DEBUG_PARSE) || defined(ONIG_DEBUG_MATCH) || \ defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \ - defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_STATISTICS) + defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_CALL) || \ + defined(ONIG_DEBUG_STATISTICS) #ifndef ONIG_DEBUG #define ONIG_DEBUG #define DBGFP stderr @@ -61,7 +63,7 @@ #define USE_CALL #define USE_CALLOUT #define USE_BACKREF_WITH_LEVEL /* \k<name+n>, \k<name-n> */ -#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ +#define USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR #define USE_RETRY_LIMIT @@ -388,10 +390,10 @@ typedef unsigned int MemStatusType; (IS_CODE_DIGIT_ASCII(enc,code) ? DIGITVAL(code) \ : (ONIGENC_IS_CODE_UPPER(enc,code) ? 
(code) - 'A' + 10 : (code) - 'a' + 10)) +#define OPTON_CALLBACK_EACH_MATCH(option) \ + ((option) & ONIG_OPTION_CALLBACK_EACH_MATCH) #define OPTON_FIND_LONGEST(option) ((option) & ONIG_OPTION_FIND_LONGEST) #define OPTON_FIND_NOT_EMPTY(option) ((option) & ONIG_OPTION_FIND_NOT_EMPTY) -#define OPTON_FIND_CONDITION(option) ((option) & \ - (ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY)) #define OPTON_NEGATE_SINGLELINE(option) ((option) & \ ONIG_OPTION_NEGATE_SINGLELINE) #define OPTON_DONT_CAPTURE_GROUP(option) ((option) & \ @@ -406,8 +408,6 @@ typedef unsigned int MemStatusType; #define OPTON_NOT_END_STRING(option) ((option) & ONIG_OPTION_NOT_END_STRING) #define OPTON_NOT_BEGIN_POSITION(option) ((option) & ONIG_OPTION_NOT_BEGIN_POSITION) -#define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \ - ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) #define INFINITE_REPEAT -1 #define IS_INFINITE_REPEAT(n) ((n) == INFINITE_REPEAT) @@ -437,81 +437,6 @@ typedef Bits* BitSetRef; #define BITSET_CLEAR_BIT(bs, pos) BS_ROOM(bs,pos) &= ~(BS_BIT(pos)) #define BITSET_INVERT_BIT(bs, pos) BS_ROOM(bs,pos) ^= BS_BIT(pos) -/* bytes buffer */ -typedef struct _BBuf { - UChar* p; - unsigned int used; - unsigned int alloc; -} BBuf; - -#define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size)) - -#define BB_EXPAND(buf,low) do{\ - do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ - (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ - if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ -} while (0) - -#define BB_ENSURE_SIZE(buf,size) do{\ - unsigned int new_alloc = (buf)->alloc;\ - while (new_alloc < (unsigned int )(size)) { new_alloc *= 2; }\ - if ((buf)->alloc != new_alloc) {\ - (buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\ - if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ - (buf)->alloc = new_alloc;\ - }\ -} while (0) - -#define BB_WRITE(buf,pos,bytes,n) do{\ - int used = (pos) + (n);\ - if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ - 
xmemcpy((buf)->p + (pos), (bytes), (n));\ - if ((buf)->used < (unsigned int )used) (buf)->used = used;\ -} while (0) - -#define BB_WRITE1(buf,pos,byte) do{\ - int used = (pos) + 1;\ - if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ - (buf)->p[(pos)] = (byte);\ - if ((buf)->used < (unsigned int )used) (buf)->used = used;\ -} while (0) - -#define BB_ADD(buf,bytes,n) BB_WRITE((buf),(buf)->used,(bytes),(n)) -#define BB_ADD1(buf,byte) BB_WRITE1((buf),(buf)->used,(byte)) -#define BB_GET_ADD_ADDRESS(buf) ((buf)->p + (buf)->used) -#define BB_GET_OFFSET_POS(buf) ((buf)->used) - -/* from < to */ -#define BB_MOVE_RIGHT(buf,from,to,n) do {\ - if ((unsigned int )((to)+(n)) > (buf)->alloc) BB_EXPAND((buf),(to) + (n));\ - xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ - if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\ -} while (0) - -/* from > to */ -#define BB_MOVE_LEFT(buf,from,to,n) do {\ - xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ -} while (0) - -/* from > to */ -#define BB_MOVE_LEFT_REDUCE(buf,from,to) do {\ - xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\ - (buf)->used -= (from - to);\ -} while (0) - -#define BB_INSERT(buf,pos,bytes,n) do {\ - if (pos >= (buf)->used) {\ - BB_WRITE(buf,pos,bytes,n);\ - }\ - else {\ - BB_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\ - xmemcpy((buf)->p + (pos), (bytes), (n));\ - }\ -} while (0) - -#define BB_GET_BYTE(buf, pos) (buf)->p[(pos)] - - /* has body */ #define ANCR_PREC_READ (1<<0) #define ANCR_PREC_READ_NOT (1<<1) @@ -884,6 +809,7 @@ typedef struct { } empty_check_start; struct { MemNumType mem; + MemStatusType empty_status_mem; } empty_check_end; /* EMPTY_CHECK_END, EMPTY_CHECK_END_MEMST, EMPTY_CHECK_END_MEMST_PUSH */ struct { RelAddrType addr; @@ -922,7 +848,7 @@ typedef struct { } update_var; struct { AbsAddrType addr; -#ifdef ONIG_DEBUG_MATCH_COUNTER +#if defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_CALL) MemNumType called_mem; 
#endif } call; @@ -977,7 +903,6 @@ struct re_pattern_buffer { MemStatusType capture_history; /* (?@...) flag (1-31) */ MemStatusType push_mem_start; /* need backtrack flag */ MemStatusType push_mem_end; /* need backtrack flag */ - MemStatusType empty_status_mem; int stack_pop_level; int repeat_range_alloc; RepeatRange* repeat_range; diff --git a/src/regparse.c b/src/regparse.c index dd2824b..938a569 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -2,7 +2,7 @@ regparse.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -159,6 +159,75 @@ OnigSyntaxType OnigSyntaxRuby = { OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_ONIGURUMA; + +#define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size)) + +#define BB_EXPAND(buf,low) do{\ + do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ + (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ + if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ +} while (0) + +#define BB_ENSURE_SIZE(buf,size) do{\ + unsigned int new_alloc = (buf)->alloc;\ + while (new_alloc < (unsigned int )(size)) { new_alloc *= 2; }\ + if ((buf)->alloc != new_alloc) {\ + (buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\ + if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ + (buf)->alloc = new_alloc;\ + }\ +} while (0) + +#define BB_WRITE(buf,pos,bytes,n) do{\ + int used = (pos) + (n);\ + if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ + xmemcpy((buf)->p + (pos), (bytes), (n));\ + if ((buf)->used < (unsigned int )used) (buf)->used = used;\ +} while (0) + +#define BB_WRITE1(buf,pos,byte) do{\ + int used = (pos) + 1;\ + if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ + (buf)->p[(pos)] = (byte);\ + if ((buf)->used < (unsigned int )used) (buf)->used = used;\ +} while (0) + 
+#define BB_ADD(buf,bytes,n) BB_WRITE((buf),(buf)->used,(bytes),(n)) +#define BB_ADD1(buf,byte) BB_WRITE1((buf),(buf)->used,(byte)) +#define BB_GET_ADD_ADDRESS(buf) ((buf)->p + (buf)->used) +#define BB_GET_OFFSET_POS(buf) ((buf)->used) + +/* from < to */ +#define BB_MOVE_RIGHT(buf,from,to,n) do {\ + if ((unsigned int )((to)+(n)) > (buf)->alloc) BB_EXPAND((buf),(to) + (n));\ + xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ + if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\ +} while (0) + +/* from > to */ +#define BB_MOVE_LEFT(buf,from,to,n) do {\ + xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ +} while (0) + +/* from > to */ +#define BB_MOVE_LEFT_REDUCE(buf,from,to) do {\ + xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\ + (buf)->used -= (from - to);\ +} while (0) + +#define BB_INSERT(buf,pos,bytes,n) do {\ + if (pos >= (buf)->used) {\ + BB_WRITE(buf,pos,bytes,n);\ + }\ + else {\ + BB_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\ + xmemcpy((buf)->p + (pos), (bytes), (n));\ + }\ +} while (0) + +#define BB_GET_BYTE(buf, pos) (buf)->p[(pos)] + + typedef enum { CS_VALUE, CS_RANGE, @@ -300,7 +369,7 @@ bbuf_clone(BBuf** rto, BBuf* from) } static int -backref_rel_to_abs(int rel_no, ScanEnv* env) +backref_rel_to_abs(int rel_no, ParseEnv* env) { if (rel_no > 0) { if (rel_no > ONIG_INT_MAX - env->num_mem) @@ -981,7 +1050,7 @@ onig_number_of_names(regex_t* reg) #endif /* else USE_ST_LIBRARY */ static int -name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) +name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ParseEnv* env) { int r; int alloc; @@ -1115,7 +1184,7 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name, } static int -name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end, +name_to_group_numbers(ParseEnv* env, const UChar* name, const UChar* name_end, int** nums) { regex_t* reg; @@ -1920,7 +1989,7 @@ 
callout_tag_table_new(CalloutTagTable** rt) } static int -callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name, +callout_tag_entry_raw(ParseEnv* env, CalloutTagTable* t, UChar* name, UChar* name_end, CalloutTagVal entry_val) { int r; @@ -1963,7 +2032,7 @@ ext_ensure_tag_table(regex_t* reg) } static int -callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, +callout_tag_entry(ParseEnv* env, regex_t* reg, UChar* name, UChar* name_end, CalloutTagVal entry_val) { int r; @@ -1988,10 +2057,10 @@ callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, #endif /* USE_CALLOUT */ -#define INIT_SCANENV_MEMENV_ALLOC_SIZE 16 +#define INIT_PARSEENV_MEMENV_ALLOC_SIZE 16 static void -scan_env_clear(ScanEnv* env) +scan_env_clear(ParseEnv* env) { MEM_STATUS_CLEAR(env->cap_history); MEM_STATUS_CLEAR(env->backtrack_mem); @@ -2024,7 +2093,7 @@ scan_env_clear(ScanEnv* env) } static int -scan_env_add_mem_entry(ScanEnv* env) +scan_env_add_mem_entry(ParseEnv* env) { int i, need, alloc; MemEnv* p; @@ -2033,10 +2102,10 @@ scan_env_add_mem_entry(ScanEnv* env) if (need > MaxCaptureNum && MaxCaptureNum != 0) return ONIGERR_TOO_MANY_CAPTURES; - if (need >= SCANENV_MEMENV_SIZE) { + if (need >= PARSEENV_MEMENV_SIZE) { if (env->mem_alloc <= need) { if (IS_NULL(env->mem_env_dynamic)) { - alloc = INIT_SCANENV_MEMENV_ALLOC_SIZE; + alloc = INIT_PARSEENV_MEMENV_ALLOC_SIZE; p = (MemEnv* )xmalloc(sizeof(MemEnv) * alloc); CHECK_NULL_RETURN_MEMERR(p); xmemcpy(p, env->mem_env_static, sizeof(env->mem_env_static)); @@ -2062,10 +2131,10 @@ scan_env_add_mem_entry(ScanEnv* env) } static int -scan_env_set_mem_node(ScanEnv* env, int num, Node* node) +scan_env_set_mem_node(ParseEnv* env, int num, Node* node) { if (env->num_mem >= num) - SCANENV_MEMENV(env)[num].mem_node = node; + PARSEENV_MEMENV(env)[num].mem_node = node; else return ONIGERR_PARSER_BUG; return 0; @@ -2285,7 +2354,7 @@ node_new_anychar(OnigOptionType options) } static int 
-node_new_no_newline(Node** node, ScanEnv* env) +node_new_no_newline(Node** node, ParseEnv* env) { Node* n; @@ -2425,7 +2494,7 @@ node_new_backref(int back_num, int* backrefs, int by_name, #ifdef USE_BACKREF_WITH_LEVEL int exist_level, int nest_level, #endif - ScanEnv* env) + ParseEnv* env) { int i; Node* node; @@ -2451,7 +2520,7 @@ node_new_backref(int back_num, int* backrefs, int by_name, for (i = 0; i < back_num; i++) { if (backrefs[i] <= env->num_mem && - IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) { + IS_NULL(PARSEENV_MEMENV(env)[backrefs[i]].mem_node)) { NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */ break; } @@ -2481,7 +2550,7 @@ node_new_backref_checker(int back_num, int* backrefs, int by_name, #ifdef USE_BACKREF_WITH_LEVEL int exist_level, int nest_level, #endif - ScanEnv* env) + ParseEnv* env) { Node* node; @@ -2527,6 +2596,7 @@ node_new_quantifier(int lower, int upper, int by_number) QUANT_(node)->head_exact = NULL_NODE; QUANT_(node)->next_head_exact = NULL_NODE; QUANT_(node)->include_referred = 0; + QUANT_(node)->empty_status_mem = 0; if (by_number != 0) NODE_STATUS_ADD(node, BY_NUMBER); @@ -2640,7 +2710,7 @@ node_set_fail(Node* node) } static int -node_new_fail(Node** node, ScanEnv* env) +node_new_fail(Node** node, ParseEnv* env) { *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); @@ -2656,7 +2726,7 @@ onig_node_reset_fail(Node* node) } static int -node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env) +node_new_save_gimmick(Node** node, enum SaveType save_type, ParseEnv* env) { int id; @@ -2675,7 +2745,7 @@ node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env) static int node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type, - int id, ScanEnv* env) + int id, ParseEnv* env) { *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); @@ -2689,7 +2759,7 @@ node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type, } static int -node_new_keep(Node** node, ScanEnv* 
env) +node_new_keep(Node** node, ParseEnv* env) { int r; @@ -2743,7 +2813,7 @@ onig_reg_callout_list_at(regex_t* reg, int num) } static int -reg_callout_list_entry(ScanEnv* env, int* rnum) +reg_callout_list_entry(ParseEnv* env, int* rnum) { #define INIT_CALLOUT_LIST_NUM 3 @@ -2795,7 +2865,7 @@ reg_callout_list_entry(ScanEnv* env, int* rnum) static int node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id, - ScanEnv* env) + ParseEnv* env) { *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); @@ -2811,7 +2881,7 @@ node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id, #endif static int -make_text_segment(Node** node, ScanEnv* env) +make_text_segment(Node** node, ParseEnv* env) { int r; int i; @@ -2868,7 +2938,7 @@ make_text_segment(Node** node, ScanEnv* env) static int make_absent_engine(Node** node, int pre_save_right_id, Node* absent, Node* step_one, int lower, int upper, int possessive, - int is_range_cutter, ScanEnv* env) + int is_range_cutter, ParseEnv* env) { int r; int i; @@ -2950,7 +3020,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, static int make_absent_tail(Node** node1, Node** node2, int pre_save_right_id, - ScanEnv* env) + ParseEnv* env) { int r; int id; @@ -2998,7 +3068,7 @@ make_absent_tail(Node** node1, Node** node2, int pre_save_right_id, } static int -make_range_clear(Node** node, ScanEnv* env) +make_range_clear(Node** node, ParseEnv* env) { int r; int id; @@ -3057,7 +3127,7 @@ make_range_clear(Node** node, ScanEnv* env) static int is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody, - int* is_possessive, ScanEnv* env) + int* is_possessive, ParseEnv* env) { Node* quant; Node* body; @@ -3123,8 +3193,8 @@ is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody, } static int -make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* quant, - Node* body, int possessive, ScanEnv* env) +make_absent_tree_for_simple_one_char_repeat(Node** node, + 
Node* absent, Node* quant, Node* body, int possessive, ParseEnv* env) { int r; int i; @@ -3171,7 +3241,7 @@ make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* qua static int make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, - ScanEnv* env) + ParseEnv* env) { int r; int i; @@ -3844,7 +3914,7 @@ add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to) } static int -add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) +add_code_range(BBuf** pbuf, ParseEnv* env, OnigCodePoint from, OnigCodePoint to) { if (from > to) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) @@ -4172,7 +4242,7 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) } static OnigCodePoint -conv_backslash_value(OnigCodePoint c, ScanEnv* env) +conv_backslash_value(OnigCodePoint c, ParseEnv* env) { if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { switch (c) { @@ -4258,10 +4328,10 @@ enum ReduceType { RQ_ASIS = 0, /* as is */ RQ_DEL = 1, /* delete parent */ RQ_A, /* to '*' */ + RQ_P, /* to '+' */ RQ_AQ, /* to '*?' */ RQ_QQ, /* to '??' */ RQ_P_QQ, /* to '+)??' */ - RQ_PQ_Q /* to '+?)?' */ }; static enum ReduceType ReduceTypeTable[6][6] = { @@ -4270,7 +4340,7 @@ static enum ReduceType ReduceTypeTable[6][6] = { {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */ {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */ {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */ - {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */ + {RQ_ASIS, RQ_A, RQ_P, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' 
*/ }; extern int @@ -4309,6 +4379,11 @@ onig_reduce_nested_quantifier(Node* pnode) p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1; goto remove_cnode; break; + case RQ_P: + NODE_BODY(pnode) = NODE_BODY(cnode); + p->lower = 1; p->upper = INFINITE_REPEAT; p->greedy = 1; + goto remove_cnode; + break; case RQ_AQ: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0; @@ -4323,10 +4398,6 @@ onig_reduce_nested_quantifier(Node* pnode) p->lower = 0; p->upper = 1; p->greedy = 0; c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1; break; - case RQ_PQ_Q: - p->lower = 0; p->upper = 1; p->greedy = 1; - c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0; - break; case RQ_ASIS: break; } @@ -4340,7 +4411,7 @@ onig_reduce_nested_quantifier(Node* pnode) } static int -node_new_general_newline(Node** node, ScanEnv* env) +node_new_general_newline(Node** node, ParseEnv* env) { int r; int dlen, alen; @@ -4472,7 +4543,7 @@ ptoken_init(PToken* tok) } static int -fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env) +fetch_interval(UChar** src, UChar* end, PToken* tok, ParseEnv* env) { int low, up, syn_allow, non_low = 0; int r = 0; @@ -4575,7 +4646,8 @@ fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env) /* \M-, \C-, \c, or \... 
*/ static int -fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) +fetch_escaped_value_raw(UChar** src, UChar* end, ParseEnv* env, + OnigCodePoint* val) { int v; OnigCodePoint c; @@ -4646,7 +4718,7 @@ fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* va } static int -fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) +fetch_escaped_value(UChar** src, UChar* end, ParseEnv* env, OnigCodePoint* val) { int r; int len; @@ -4660,7 +4732,7 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) return 0; } -static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env); +static int fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env); static OnigCodePoint get_name_end_code_point(OnigCodePoint start) @@ -4691,7 +4763,7 @@ enum REF_NUM { */ static int fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, - UChar** rname_end, ScanEnv* env, + UChar** rname_end, ParseEnv* env, int* rback_num, int* rlevel, enum REF_NUM* num_type) { int r, sign, exist_level; @@ -4825,7 +4897,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, */ static int fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, - UChar** rname_end, ScanEnv* env, int* rback_num, + UChar** rname_end, ParseEnv* env, int* rback_num, enum REF_NUM* num_type, int is_ref) { int r, sign; @@ -4957,7 +5029,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } static void -CC_ESC_WARN(ScanEnv* env, UChar *c) +CC_ESC_WARN(ParseEnv* env, UChar *c) { if (onig_warn == onig_null_warn) return ; @@ -4973,7 +5045,7 @@ CC_ESC_WARN(ScanEnv* env, UChar *c) } static void -CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) +CLOSE_BRACKET_WITHOUT_ESC_WARN(ParseEnv* env, UChar* c) { if (onig_warn == onig_null_warn) return ; @@ -5054,11 +5126,12 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, } static int 
-fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) +fetch_token_cc(PToken* tok, UChar** src, UChar* end, ParseEnv* env, int state) { int r; OnigCodePoint code; OnigCodePoint c, c2; + int mindigits, maxdigits; OnigSyntaxType* syn = env->syntax; OnigEncoding enc = env->enc; UChar* prev; @@ -5247,10 +5320,11 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) case 'u': if (PEND) break; - prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + mindigits = maxdigits = 4; + u_hex_digits: + r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code); if (r < 0) return r; if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ @@ -5261,6 +5335,15 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) } break; + case 'U': + if (PEND) break; + prev = p; + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { + mindigits = maxdigits = 8; + goto u_hex_digits; + } + break; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { @@ -5327,15 +5410,22 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) } static int -fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) +fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env) { int r; OnigCodePoint code; OnigCodePoint c; - OnigEncoding enc = env->enc; - OnigSyntaxType* syn = env->syntax; + int mindigits, maxdigits; UChar* prev; - UChar* p = *src; + int allow_num; + OnigEncoding enc; + OnigSyntaxType* syn; + UChar* p; + + enc = env->enc; + syn = env->syntax; + p = *src; + PFETCH_READY; if (tok->code_point_continue != 0) { @@ -5574,12 +5664,20 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case 'Z': - if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; - tok->type = TK_ANCHOR; - tok->u.subtype = ANCR_SEMI_END_BUF; + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { + goto end_buf; + } + else { + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; + tok->type = TK_ANCHOR; + tok->u.subtype = ANCR_SEMI_END_BUF; + } break; case 'z': + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) + return ONIGERR_UNDEFINED_OPERATOR; + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; end_buf: tok->type = TK_ANCHOR; @@ -5668,10 +5766,11 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'u': if (PEND) break; - prev = p; + mindigits = maxdigits = 4; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + u_hex_digits: + r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code); if (r < 0) return r; if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ @@ -5682,6 +5781,15 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } break; + case 'U': + if (PEND) break; + prev = p; + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { + mindigits = maxdigits = 8; + goto u_hex_digits; + } + break; + case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': PUNFETCH; @@ -5694,7 +5802,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && (r <= env->num_mem || r <= 9)) { /* This spec. 
from GNU regex */ if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { - if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node)) + if (r > env->num_mem || IS_NULL(PARSEENV_MEMENV(env)[r].mem_node)) return ONIGERR_INVALID_BACKREF; } @@ -5743,6 +5851,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) int back_num; enum REF_NUM num_type; + allow_num = 1; + + backref_start: prev = p; #ifdef USE_BACKREF_WITH_LEVEL @@ -5757,6 +5868,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (r < 0) return r; if (num_type != IS_NOT_NUM) { + if (allow_num == 0) return ONIGERR_INVALID_BACKREF; + if (num_type == IS_REL_NUM) { back_num = backref_rel_to_abs(back_num, env); } @@ -5765,7 +5878,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) + IS_NULL(PARSEENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; @@ -5782,7 +5895,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) + IS_NULL(PARSEENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } @@ -5813,12 +5926,17 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) UChar* name_end; enum REF_NUM num_type; + allow_num = 1; + + call_start: prev = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type != IS_NOT_NUM) { + if (allow_num == 0) return ONIGERR_UNDEFINED_GROUP_REFERENCE; + if (num_type == IS_REL_NUM) { gnum = backref_rel_to_abs(gnum, env); if (gnum < 0) { @@ -5975,6 +6093,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '(': if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { + prev = p; PINC; if 
(! PEND) { c = PPEEK; @@ -6062,11 +6181,35 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) break; } } + else if (c == 'P' && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) { + PINC; /* skip 'P' */ + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + allow_num = 0; + if (c == '=') { + c = '('; + goto backref_start; + } + else if (c == '>') { +#ifdef USE_CALL + c = '('; + goto call_start; +#else + return ONIGERR_UNDEFINED_OPERATOR; +#endif + } + else { + p = prev; + goto lparen_qmark_end2; + } + } } lparen_qmark_end: PUNFETCH; } + lparen_qmark_end2: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; tok->type = TK_SUBEXP_OPEN; break; @@ -6295,7 +6438,7 @@ add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not, } static int -add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) +add_ctype_to_cc(CClassNode* cc, int ctype, int not, ParseEnv* env) { int c, r; int ascii_mode; @@ -6398,7 +6541,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } static int -prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) +prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ParseEnv* env) { #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 #define POSIX_BRACKET_NAME_MIN_LEN 4 @@ -6472,7 +6615,7 @@ prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) } static int -fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) +fetch_char_property_to_ctype(UChar** src, UChar* end, ParseEnv* env) { int r; OnigCodePoint c; @@ -6507,7 +6650,8 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) } static int -prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, + ParseEnv* env) { int r, ctype; CClassNode* cc; @@ -6528,7 +6672,7 @@ prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) static int 
cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, - ScanEnv* env) + ParseEnv* env) { int r; @@ -6552,7 +6696,7 @@ cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, static int cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, int* from_raw, int to_raw, CVAL intype, CVAL* type, - CSTATE* state, ScanEnv* env) + CSTATE* state, ParseEnv* env) { int r; @@ -6621,7 +6765,7 @@ cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, static int code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, - ScanEnv* env) + ParseEnv* env) { int in_esc; OnigCodePoint code; @@ -6643,7 +6787,7 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, } static int -prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ParseEnv* env) { int r, neg, len, fetched, and_start; OnigCodePoint in_code, curr_code; @@ -6995,13 +7139,14 @@ prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } static int prs_alts(Node** top, PToken* tok, int term, - UChar** src, UChar* end, ScanEnv* env, int group_head); + UChar** src, UChar* end, ParseEnv* env, int group_head); #ifdef USE_CALLOUT /* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */ static int -prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) +prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, + ParseEnv* env) { int r; int i; @@ -7184,7 +7329,7 @@ clear_callout_args(int n, unsigned int types[], OnigValue vals[]) static int prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, int max_arg_num, unsigned int types[], OnigValue vals[], - ScanEnv* env) + ParseEnv* env) { #define MAX_CALLOUT_ARG_BYTE_LENGTH 128 @@ -7347,7 +7492,8 @@ prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, /* (*name[TAG]) (*name[TAG]{a,b,..}) */ static int 
-prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) +prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, + ParseEnv* env) { int r; int i; @@ -7514,7 +7660,7 @@ prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) static int prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env) + ParseEnv* env) { int r, num; Node *target; @@ -7747,7 +7893,7 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) + IS_NULL(PARSEENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } @@ -7769,7 +7915,7 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) + IS_NULL(PARSEENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } @@ -7932,12 +8078,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, break; #endif + case 'P': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) { + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + if (c == '<') goto named_group1; + + return ONIGERR_UNDEFINED_GROUP_OPTION; + } + /* else fall */ + case 'W': case 'D': case 'S': + case 'y': + if (! 
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + /* else fall */ + #ifdef USE_POSIXLINE_OPTION case 'p': #endif + case 'a': case '-': case 'i': case 'm': case 's': case 'x': - case 'W': case 'D': case 'S': case 'P': - case 'y': { int neg = 0; @@ -7974,10 +8134,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); break; #endif - case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break; - case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break; - case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break; - case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break; + case 'W': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); + break; + case 'D': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); + break; + case 'S': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); + break; + case 'P': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); + break; case 'y': /* y{g}, y{w} */ { @@ -8016,8 +8192,15 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, PFETCH(c); if (c != '}') return ONIGERR_UNDEFINED_GROUP_OPTION; - break; } /* case 'y' */ + break; + + case 'a': + if (! 
IS_SYNTAX_BV(env->syntax, ONIG_SYN_PYTHON)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + + OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); + break; default: return ONIGERR_UNDEFINED_GROUP_OPTION; @@ -8112,7 +8295,7 @@ static const char* ReduceQStr[] = { }; static int -assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env) +assign_quantifier_body(Node* qnode, Node* target, int group, ParseEnv* env) { QuantNode* qn; @@ -8260,35 +8443,38 @@ onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, } typedef struct { - ScanEnv* env; + ParseEnv* env; CClassNode* cc; Node* alt_root; Node** ptail; } IApplyCaseFoldArg; static int -i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) +i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, + void* arg) { IApplyCaseFoldArg* iarg; - ScanEnv* env; + ParseEnv* env; + OnigEncoding enc; CClassNode* cc; iarg = (IApplyCaseFoldArg* )arg; env = iarg->env; cc = iarg->cc; + enc = env->enc; if (to_len == 1) { - int is_in = onig_is_code_in_cc(env->enc, from, cc); + int is_in = onig_is_code_in_cc(enc, from, cc); #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || (is_in == 0 && IS_NCCLASS_NOT(cc))) { - ADD_CODE_INTO_CC(cc, *to, env->enc); + ADD_CODE_INTO_CC(cc, *to, enc); } #else if (is_in != 0) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || - ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) { - if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); + if (ONIGENC_MBC_MINLEN(enc) > 1 || + ONIGENC_CODE_TO_MBCLEN(enc, *to) != 1) { + if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, enc); add_code_range(&(cc->mbuf), env, *to, *to); } else { @@ -8305,7 +8491,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) int r, i, len; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - if (onig_is_code_in_cc(env->enc, from, cc) + if (onig_is_code_in_cc(enc, from, cc) #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS && 
!IS_NCCLASS_NOT(cc) #endif @@ -8320,8 +8506,9 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) Node* csnode; CClassNode* cs_cc; - index = onigenc_unicode_fold1_key(&to[i]); - if (index >= 0) { + index = 0; + if (ONIGENC_IS_UNICODE_ENCODING(enc) && + (index = onigenc_unicode_fold1_key(&to[i])) >= 0) { csnode = node_new_cclass(); cs_cc = CCLASS_(csnode); if (IS_NULL(csnode)) { @@ -8332,18 +8519,22 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) m = FOLDS1_UNFOLDS_NUM(index); for (j = 0; j < m; j++) { code = FOLDS1_UNFOLDS(index)[j]; - ADD_CODE_INTO_CC(cs_cc, code, env->enc); + ADD_CODE_INTO_CC(cs_cc, code, enc); } - ADD_CODE_INTO_CC(cs_cc, to[i], env->enc); + ADD_CODE_INTO_CC(cs_cc, to[i], enc); ns[n++] = csnode; } else { - len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); + len = ONIGENC_CODE_TO_MBC(enc, to[i], buf); if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) { csnode = node_new_str(buf, buf + len); if (IS_NULL(csnode)) goto err_free_ns; - NODE_STRING_SET_CASE_EXPANDED(csnode); + if (index == 0) + NODE_STATUS_ADD(csnode, IGNORECASE); + else + NODE_STRING_SET_CASE_EXPANDED(csnode); + ns[n++] = csnode; } else { @@ -8372,7 +8563,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) static int prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) + ParseEnv* env, int group_head) { int r, len, group; Node* qn; @@ -8778,7 +8969,7 @@ prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, static int prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) + ParseEnv* env, int group_head) { int r; Node *node, **headp; @@ -8829,7 +9020,7 @@ prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ static int prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) + 
ParseEnv* env, int group_head) { int r; Node *node, **headp; @@ -8892,7 +9083,7 @@ prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, } static int -prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) +prs_regexp(Node** top, UChar** src, UChar* end, ParseEnv* env) { int r; PToken tok; @@ -8908,7 +9099,7 @@ prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_CALL static int -make_call_zero_body(Node* node, ScanEnv* env, Node** rnode) +make_call_zero_body(Node* node, ParseEnv* env, Node** rnode) { int r; @@ -8930,7 +9121,7 @@ make_call_zero_body(Node* node, ScanEnv* env, Node** rnode) extern int onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, - regex_t* reg, ScanEnv* env) + regex_t* reg, ParseEnv* env) { int r; UChar* p; @@ -8945,7 +9136,6 @@ onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, reg->num_empty_check = 0; reg->repeat_range_alloc = 0; reg->repeat_range = (RepeatRange* )NULL; - reg->empty_status_mem = 0; names_clear(reg); @@ -8990,7 +9180,7 @@ onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, } extern void -onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED, +onig_scan_env_set_error_string(ParseEnv* env, int ecode ARG_UNUSED, UChar* arg, UChar* arg_end) { env->error = arg; diff --git a/src/regparse.h b/src/regparse.h index c60a42d..8875f78 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -4,7 +4,7 @@ regparse.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -73,6 +73,14 @@ enum BodyEmptyType { BODY_MAY_BE_EMPTY_REC = 3 }; +/* bytes buffer */ +typedef struct _BBuf { + UChar* p; + unsigned int used; + unsigned int alloc; +} BBuf; + + struct _Node; typedef struct { @@ -110,6 +118,7 @@ typedef struct { struct _Node* head_exact; struct _Node* next_head_exact; int include_referred; /* include called node. don't eliminate even if {0} */ + MemStatusType empty_status_mem; } QuantNode; typedef struct { @@ -340,6 +349,7 @@ typedef struct { #define NODE_ST_ABSENT_WITH_SIDE_EFFECTS (1<<24) /* stopper or clear */ #define NODE_ST_FIXED_CLEN_MIN_SURE (1<<25) #define NODE_ST_REFERENCED (1<<26) +#define NODE_ST_INPEEK (1<<27) #define NODE_STATUS(node) (((Node* )node)->u.base.status) @@ -376,6 +386,7 @@ typedef struct { #define NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) ((NODE_STATUS(node) & NODE_ST_ABSENT_WITH_SIDE_EFFECTS) != 0) #define NODE_IS_FIXED_CLEN_MIN_SURE(node) ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN_MIN_SURE) != 0) #define NODE_IS_REFERENCED(node) ((NODE_STATUS(node) & NODE_ST_REFERENCED) != 0) +#define NODE_IS_INPEEK(node) ((NODE_STATUS(node) & NODE_ST_INPEEK) != 0) #define NODE_PARENT(node) ((node)->u.base.parent) #define NODE_BODY(node) ((node)->u.base.body) @@ -384,8 +395,8 @@ typedef struct { #define NODE_CALL_BODY(node) ((node)->body) #define NODE_ANCHOR_BODY(node) ((node)->body) -#define SCANENV_MEMENV_SIZE 8 -#define SCANENV_MEMENV(senv) \ +#define PARSEENV_MEMENV_SIZE 8 +#define PARSEENV_MEMENV(senv) \ (IS_NOT_NULL((senv)->mem_env_dynamic) ? 
\ (senv)->mem_env_dynamic : (senv)->mem_env_static) @@ -424,7 +435,7 @@ typedef struct { int num_mem; int num_named; int mem_alloc; - MemEnv mem_env_static[SCANENV_MEMENV_SIZE]; + MemEnv mem_env_static[PARSEENV_MEMENV_SIZE]; MemEnv* mem_env_dynamic; int backref_num; int keep_num; @@ -439,14 +450,14 @@ typedef struct { #ifdef ONIG_DEBUG_PARSE unsigned int max_parse_depth; #endif -} ScanEnv; +} ParseEnv; extern int onig_renumber_name_table P_((regex_t* reg, GroupNumMap* map)); extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); -extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); +extern void onig_scan_env_set_error_string P_((ParseEnv* env, int ecode, UChar* arg, UChar* arg_end)); extern int onig_reduce_nested_quantifier P_((Node* pnode)); extern int onig_node_copy(Node** rcopy, Node* from); extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); @@ -460,7 +471,7 @@ extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); extern Node* onig_node_new_list P_((Node* left, Node* right)); extern Node* onig_node_new_alt P_((Node* left, Node* right)); extern int onig_names_free P_((regex_t* reg)); -extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); +extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ParseEnv* env)); extern int onig_free_shared_cclass_table P_((void)); extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); extern int onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]); diff --git a/src/regposix.c b/src/regposix.c index 497ba02..494446f 100644 --- a/src/regposix.c +++ b/src/regposix.c @@ -2,7 +2,7 @@ regposix.c - Oniguruma (regular expression library) 
**********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -120,6 +120,7 @@ onig2posix_error_code(int code) { ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED, REG_BADPAT }, { ONIGERR_TOO_BIG_WIDE_CHAR_VALUE, REG_EONIG_BADWC }, { ONIGERR_TOO_LONG_WIDE_CHAR_VALUE, REG_EONIG_BADWC }, + { ONIGERR_UNDEFINED_OPERATOR, REG_BADPAT }, { ONIGERR_INVALID_CODE_POINT_VALUE, REG_EONIG_BADWC }, { ONIGERR_EMPTY_GROUP_NAME, REG_BADPAT }, { ONIGERR_INVALID_GROUP_NAME, REG_BADPAT }, @@ -141,6 +142,7 @@ onig2posix_error_code(int code) { ONIGERR_INVALID_CALLOUT_TAG_NAME, REG_BADPAT }, { ONIGERR_INVALID_CALLOUT_ARG, REG_BADPAT }, { ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION, REG_EONIG_BADARG }, + { ONIGERR_VERY_INEFFICIENT_PATTERN, REG_BADPAT }, { ONIGERR_LIBRARY_IS_NOT_INITIALIZED, REG_EONIG_INTERNAL } }; diff --git a/src/regsyntax.c b/src/regsyntax.c index 984aac6..8e1c313 100644 --- a/src/regsyntax.c +++ b/src/regsyntax.c @@ -2,7 +2,7 @@ regsyntax.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -240,6 +240,35 @@ OnigSyntaxType OnigSyntaxPerl_NG = { } }; +/* Python 3.9 */ +OnigSyntaxType OnigSyntaxPython = { + (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | + ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | + ONIG_SYN_OP_ESC_CONTROL_CHARS | + ONIG_SYN_OP_ESC_C_CONTROL ) + & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) + , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | + ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE | + ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME | + ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | + ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | + ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME | + ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | + ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 ) + , ( SYN_GNU_REGEX_BV | ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH | + ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_PYTHON ) + , ONIG_OPTION_SINGLELINE + , + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + } +}; + extern int diff --git a/src/unicode.c b/src/unicode.c index 6703d4b..efe5f73 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -2,7 +2,7 @@ unicode.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -77,9 +77,8 @@ static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = { #include "unicode_fold_data.c" extern int -onigenc_unicode_mbc_case_fold(OnigEncoding enc, - OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end, - UChar* fold) +onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag, + const UChar** pp, const UChar* end, UChar* fold) { const struct ByUnfoldKey* buk; @@ -104,23 +103,27 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, } #endif - buk = onigenc_unicode_unfold_key(code); - if (buk != 0) { - if (buk->fold_len == 1) { - return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold); - } - else { - OnigCodePoint* addr; - - FOLDS_FOLD_ADDR_BUK(buk, addr); - rlen = 0; - for (i = 0; i < buk->fold_len; i++) { - OnigCodePoint c = addr[i]; - len = ONIGENC_CODE_TO_MBC(enc, c, fold); - fold += len; - rlen += len; + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(code)) { + buk = onigenc_unicode_unfold_key(code); + if (buk != 0) { + if (buk->fold_len == 1) { + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || + ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk->index))) + return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold); + } + else { + OnigCodePoint* addr; + + FOLDS_FOLD_ADDR_BUK(buk, addr); + rlen = 0; + for (i = 0; i < buk->fold_len; i++) { + OnigCodePoint c = addr[i]; + len = ONIGENC_CODE_TO_MBC(enc, c, fold); + fold += len; + rlen += len; + } + return rlen; } - return rlen; } } @@ -131,16 +134,22 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, } static int -apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg) +apply_case_fold1(OnigCaseFoldType flag, int from, int to, + OnigApplyAllCaseFoldFunc f, void* arg) { int i, j, k, n, r; for (i = from; i < to; ) { OnigCodePoint fold = *FOLDS1_FOLD(i); + if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! 
ONIGENC_IS_ASCII_CODE(fold)) break; + n = FOLDS1_UNFOLDS_NUM(i); for (j = 0; j < n; j++) { OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j]; + if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(unfold)) + continue; + r = (*f)(fold, &unfold, 1, arg); if (r != 0) return r; r = (*f)(unfold, &fold, 1, arg); @@ -148,6 +157,9 @@ apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg) for (k = 0; k < j; k++) { OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k]; + if (CASE_FOLD_IS_ASCII_ONLY(flag) && + ! ONIGENC_IS_ASCII_CODE(unfold2)) continue; + r = (*f)(unfold, &unfold2, 1, arg); if (r != 0) return r; r = (*f)(unfold2, &unfold, 1, arg); @@ -225,7 +237,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, { int r; - r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg); + r = apply_case_fold1(flag, 0, FOLDS1_NORMAL_END_INDEX, f, arg); if (r != 0) return r; #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI @@ -246,7 +258,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, } else { #endif - r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg); + r = apply_case_fold1(flag, FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg); if (r != 0) return r; #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI } @@ -288,6 +300,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, n = 0; code = ONIGENC_MBC_TO_CODE(enc, p, end); + if (CASE_FOLD_IS_ASCII_ONLY(flag)) { + if (! 
ONIGENC_IS_ASCII_CODE(code)) return n; + } len = enclen(enc, p); #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI @@ -449,19 +464,26 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, if (buk1 != 0) { if (buk1->fold_len == 1) { int un; - items[0].byte_len = lens[0]; - items[0].code_len = 1; - items[0].code[0] = *FOLDS1_FOLD(buk1->index); - n++; + + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || + ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk1->index))) { + items[0].byte_len = lens[0]; + items[0].code_len = 1; + items[0].code[0] = *FOLDS1_FOLD(buk1->index); + n++; + } un = FOLDS1_UNFOLDS_NUM(buk1->index); for (i = 0; i < un; i++) { OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i]; if (unfold != orig_codes[0]) { - items[n].byte_len = lens[0]; - items[n].code_len = 1; - items[n].code[0] = unfold; - n++; + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || + ONIGENC_IS_ASCII_CODE(unfold)) { + items[n].byte_len = lens[0]; + items[n].code_len = 1; + items[n].code[0] = unfold; + n++; + } } } } @@ -548,10 +570,13 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { - items[n].byte_len = lens[0]; - items[n].code_len = 1; - items[n].code[0] = FOLDS1_UNFOLDS(index)[i]; - n++; + code = FOLDS1_UNFOLDS(index)[i]; + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)||ONIGENC_IS_ASCII_CODE(code)) { + items[n].byte_len = lens[0]; + items[n].code_len = 1; + items[n].code[0] = code; + n++; + } } } } diff --git a/test-driver b/test-driver index b8521a4..9759384 100755 --- a/test-driver +++ b/test-driver @@ -3,7 +3,7 @@ scriptversion=2018-03-07.03; # UTC -# Copyright (C) 2011-2018 Free Software Foundation, Inc. +# Copyright (C) 2011-2020 Free Software Foundation, Inc. 
# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -42,11 +42,13 @@ print_usage () { cat <<END Usage: - test-driver --test-name=NAME --log-file=PATH --trs-file=PATH - [--expect-failure={yes|no}] [--color-tests={yes|no}] - [--enable-hard-errors={yes|no}] [--] + test-driver --test-name NAME --log-file PATH --trs-file PATH + [--expect-failure {yes|no}] [--color-tests {yes|no}] + [--enable-hard-errors {yes|no}] [--] TEST-SCRIPT [TEST-SCRIPT-ARGUMENTS] + The '--test-name', '--log-file' and '--trs-file' options are mandatory. +See the GNU Automake documentation for information. END } diff --git a/test/Makefile.am b/test/Makefile.am index f12eebe..36f8dbe 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,14 +1,14 @@ ## Makefile.am for Oniguruma lib_onig = ../src/libonig.la -AM_LDFLAGS = -L$(prefix)/lib +AM_LDFLAGS = -L$(libdir) AM_CFLAGS = -Wall -Wno-invalid-source-encoding AM_CPPFLAGS = -I$(top_srcdir)/src if ENABLE_POSIX_API -TESTS = test_utf8 test_syntax testc testp testcu test_regset test_back +TESTS = test_utf8 test_syntax test_options testc testp testcu test_regset test_back else -TESTS = test_utf8 test_syntax testc testcu test_regset test_back +TESTS = test_utf8 test_syntax test_options testc testcu test_regset test_back endif check_PROGRAMS = $(TESTS) @@ -18,6 +18,8 @@ test: test_uchar $(TESTS) @./test_utf8 | grep RESULT @echo "[Oniguruma API, SYNTAX check]" @./test_syntax | grep RESULT + @echo "[Oniguruma API, Options check]" + @./test_options | grep RESULT @echo "[Oniguruma API, EUC-JP check]" @./testc | grep RESULT if ENABLE_POSIX_API @@ -43,6 +45,9 @@ test_utf8_LDADD = $(lib_onig) test_syntax_SOURCES = test_syntax.c test_syntax_LDADD = $(lib_onig) +test_options_SOURCES = test_options.c +test_options_LDADD = $(lib_onig) + testc_SOURCES = testc.c testc_LDADD = $(lib_onig) @@ -62,6 +67,7 @@ test_back_LDADD = $(lib_onig) gcov: make CFLAGS="--coverage" test_utf8 
make CFLAGS="--coverage" test_syntax + make CFLAGS="--coverage" test_options make CFLAGS="--coverage" testc if ENABLE_POSIX_API make CFLAGS="--coverage" testp diff --git a/test/test_back.c b/test/test_back.c index 6bf5159..9a6e4a8 100644 --- a/test/test_back.c +++ b/test/test_back.c @@ -1,8 +1,7 @@ /* * test_back.c - * Copyright (c) 2020 K.Kosako + * Copyright (c) 2020-2021 K.Kosako */ -#include "config.h" #ifdef ONIG_ESCAPE_UCHAR_COLLISION #undef ONIG_ESCAPE_UCHAR_COLLISION #endif @@ -141,7 +140,7 @@ static void xe(char* pattern, char* str, int error_no, int line_no) #define x2(p,s,f,t) xx2(p,s,f,t, __LINE__) #define x3(p,s,f,t,m) xx3(p,s,f,t,m, __LINE__) #define n(p,s) xn(p,s, __LINE__) -#define e(p,s,e) xe(p,s,e, __LINE__) +#define e(p,s,en) xe(p,s,en, __LINE__) extern int main(int argc, char* argv[]) { @@ -1331,10 +1330,10 @@ extern int main(int argc, char* argv[]) x2("(?<!v|^t|^a+.*[efg])z", "uabcdfz", 6, 7); x2("((?(a)\\g<1>|b))", "aab", 2, 3); - x2("((?(a)\\g<1>))", "aab", 1, 2); + x2("((?(a)\\g<1>))", "aab", 3, 3); x2("(b(?(a)|\\g<1>))", "bba", 1, 3); e("(()(?(2)\\g<1>))", "", ONIGERR_NEVER_ENDING_RECURSION); - x2("(?(a)(?:b|c))", "ac", 0, 2); + x2("(?(a)(?:b|c))", "ac", 2, 2); n("^(?(a)b|c)", "ac"); x2("(?i)a|b", "B", 0, 1); n("((?i)a|b.)|c", "C"); diff --git a/test/test_options.c b/test/test_options.c new file mode 100644 index 0000000..7010f0f --- /dev/null +++ b/test/test_options.c @@ -0,0 +1,224 @@ +/* + * test_options.c + * Copyright (c) 2020-2021 K.Kosako + */ +#ifdef ONIG_ESCAPE_UCHAR_COLLISION +#undef ONIG_ESCAPE_UCHAR_COLLISION +#endif +#include <stdio.h> + +#include "oniguruma.h" + +#include <string.h> + +#define SLEN(s) strlen(s) + +static int nsucc = 0; +static int nfail = 0; +static int nerror = 0; + +#ifdef __TRUSTINSOFT_ANALYZER__ +static int nall = 0; +#endif + +static FILE* err_file; + +static OnigRegion* region; + +static void xx(OnigOptionType options, char* pattern, char* str, + int from, int to, int mem, int not, int error_no, int 
line_no) +{ +#ifdef __TRUSTINSOFT_ANALYZER__ + if (nall++ % TIS_TEST_CHOOSE_MAX != TIS_TEST_CHOOSE_CURRENT) return; +#endif + + int r; + regex_t* reg; + OnigErrorInfo einfo; + + r = onig_new(&reg, (UChar* )pattern, (UChar* )(pattern + SLEN(pattern)), + options, ONIG_ENCODING_UTF8, ONIG_SYNTAX_DEFAULT, &einfo); + if (r) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + + if (error_no == 0) { + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(err_file, "ERROR: %s /%s/ #%d\n", s, pattern, line_no); + nerror++; + } + else { + if (r == error_no) { + fprintf(stdout, "OK(ERROR): /%s/ %d #%d\n", pattern, r, line_no); + nsucc++; + } + else { + fprintf(stdout, "FAIL(ERROR): /%s/ '%s', %d, %d #%d\n", pattern, str, + error_no, r, line_no); + nfail++; + } + } + + return ; + } + + r = onig_search(reg, (UChar* )str, (UChar* )(str + SLEN(str)), + (UChar* )str, (UChar* )(str + SLEN(str)), + region, options); + if (r < ONIG_MISMATCH) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + + if (error_no == 0) { + onig_error_code_to_str((UChar* )s, r); + fprintf(err_file, "ERROR: %s /%s/ #%d\n", s, pattern, line_no); + nerror++; + } + else { + if (r == error_no) { + fprintf(stdout, "OK(ERROR): /%s/ '%s', %d #%d\n", + pattern, str, r, line_no); + nsucc++; + } + else { + fprintf(stdout, "FAIL ERROR NO: /%s/ '%s', %d, %d #%d\n", + pattern, str, error_no, r, line_no); + nfail++; + } + } + + return ; + } + + if (r == ONIG_MISMATCH) { + if (not) { + fprintf(stdout, "OK(N): /%s/ '%s' #%d\n", pattern, str, line_no); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s' #%d\n", pattern, str, line_no); + nfail++; + } + } + else { + if (not) { + fprintf(stdout, "FAIL(N): /%s/ '%s' #%d\n", pattern, str, line_no); + nfail++; + } + else { + if (region->beg[mem] == from && region->end[mem] == to) { + fprintf(stdout, "OK: /%s/ '%s' #%d\n", pattern, str, line_no); + nsucc++; + } + else { + fprintf(stdout, "FAIL: /%s/ '%s' %d-%d : %d-%d #%d\n", pattern, str, + from, to, region->beg[mem], region->end[mem], 
line_no); + nfail++; + } + } + } + onig_free(reg); +} + +static void xx2(OnigOptionType options, char* pattern, char* str, + int from, int to, int line_no) +{ + xx(options, pattern, str, from, to, 0, 0, 0, line_no); +} + +static void xx3(OnigOptionType options, char* pattern, char* str, + int from, int to, int mem, int line_no) +{ + xx(options, pattern, str, from, to, mem, 0, 0, line_no); +} + +static void xn(OnigOptionType options, char* pattern, char* str, int line_no) +{ + xx(options, pattern, str, 0, 0, 0, 1, 0, line_no); +} + +#if 0 +static void xe(OnigOptionType options, char* pattern, char* str, + int error_no, int line_no) +{ + xx(options, pattern, str, 0, 0, 0, 0, error_no, line_no); +} +#endif + +#define x2(o,p,s,f,t) xx2(o,p,s,f,t, __LINE__) +#define x3(o,p,s,f,t,m) xx3(o,p,s,f,t,m, __LINE__) +#define n(o,p,s) xn(o,p,s, __LINE__) +#define e(o,p,s,en) xe(o,p,s,en, __LINE__) + +#define OIA (ONIG_OPTION_IGNORECASE | ONIG_OPTION_IGNORECASE_IS_ASCII) + +extern int main(int argc, char* argv[]) +{ + OnigEncoding use_encs[1]; + + use_encs[0] = ONIG_ENCODING_UTF8; + onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); + + err_file = stdout; + + region = onig_region_new(); + + x2(ONIG_OPTION_IGNORECASE, "a", "A", 0, 1); + n(ONIG_OPTION_IGNORECASE_IS_ASCII, "a", "A"); + /* KELVIN SIGN */ + x2(ONIG_OPTION_IGNORECASE, "\xe2\x84\xaa", "k", 0, 1); + x2(ONIG_OPTION_IGNORECASE, "k", "\xe2\x84\xaa", 0, 3); + n(OIA, "\xe2\x84\xaa", "k"); + n(OIA, "k", "\xe2\x84\xaa"); + x2(OIA, "a", "a", 0, 1); + x2(OIA, "A", "A", 0, 1); + x2(OIA, "a", "A", 0, 1); + x2(OIA, "A", "a", 0, 1); + x2(OIA, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz", 0, 26); + x2(OIA, "abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", 0, 26); + x2(OIA, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCabcdefghijklmnopqrstuvwxyz", 3, 29); + x2(OIA, "abcdefghijklmnopqrstuvwxyz", "abcABCDEFGHIJKLMNOPQRSTUVWXYZ", 3, 29); + x3(OIA, "#%(a!;)(b&)", "#%A!;B&", 5, 7, 2); + + x2(ONIG_OPTION_IGNORECASE, 
"ss", "\xc3\x9f", 0, 2); + x2(ONIG_OPTION_IGNORECASE, "\xc3\x9f", "SS", 0, 2); + n(OIA, "ss", "\xc3\x9f"); + n(OIA, "\xc3\x9f", "ss"); + x2(OIA, "ss", "SS", 0, 2); + x2(OIA, "Ss", "sS", 0, 2); + + n(ONIG_OPTION_NOTBOL, "^ab", "ab"); + n(ONIG_OPTION_NOTBOL, "\\Aab", "ab"); + n(ONIG_OPTION_NOTEOL, "ab$", "ab"); + n(ONIG_OPTION_NOTEOL, "ab\\z", "ab"); + n(ONIG_OPTION_NOTEOL, "ab\\Z", "ab"); + n(ONIG_OPTION_NOTEOL, "ab\\Z", "ab\n"); + + n(ONIG_OPTION_NOT_BEGIN_STRING, "\\Aab", "ab"); + n(ONIG_OPTION_NOT_END_STRING, "ab\\z", "ab"); + n(ONIG_OPTION_NOT_END_STRING, "ab\\Z", "ab"); + n(ONIG_OPTION_NOT_END_STRING, "ab\\Z", "ab\n"); + + x2(ONIG_OPTION_WORD_IS_ASCII, "\\w", "@g", 1, 2); + n(ONIG_OPTION_WORD_IS_ASCII, "\\w", "あ"); + x2(ONIG_OPTION_NONE, "\\d", "１", 0, 3); + n(ONIG_OPTION_DIGIT_IS_ASCII, "\\d", "１"); + x2(ONIG_OPTION_SPACE_IS_ASCII, "\\s", " ", 0, 1); + x2(ONIG_OPTION_NONE, "\\s", "　", 0, 3); + n(ONIG_OPTION_SPACE_IS_ASCII, "\\s", "　"); + + x2(ONIG_OPTION_POSIX_IS_ASCII, "\\w\\d\\s", "c3 ", 0, 3); + n(ONIG_OPTION_POSIX_IS_ASCII, "\\w|\\d|\\s", "あ４　"); + + x2(ONIG_OPTION_EXTEND, " abc \n def", "abcdef", 0, 6); + x2(ONIG_OPTION_FIND_LONGEST, "\\w+", "abc defg hij", 4, 8); + x2(ONIG_OPTION_FIND_NOT_EMPTY, "\\w*", "@@@ abc defg hij", 4, 7); + + + fprintf(stdout, + "\nRESULT SUCC: %4d, FAIL: %d, ERROR: %d (by Oniguruma %s)\n", + nsucc, nfail, nerror, onig_version()); + + onig_region_free(region, 1); + onig_end(); + + return ((nfail == 0 && nerror == 0) ? 
0 : -1); +} diff --git a/test/test_syntax.c b/test/test_syntax.c index 06fef45..b501ccd 100644 --- a/test/test_syntax.c +++ b/test/test_syntax.c @@ -1,8 +1,7 @@ /* * test_syntax.c - * Copyright (c) 2019-2020 K.Kosako + * Copyright (c) 2019-2021 K.Kosako */ -#include "config.h" #ifdef ONIG_ESCAPE_UCHAR_COLLISION #undef ONIG_ESCAPE_UCHAR_COLLISION #endif @@ -139,7 +138,7 @@ static void e(char* pattern, char* str, int error_no) xx(pattern, str, 0, 0, 0, 0, error_no); } -static int test_fixed_interval() +static int test_reluctant_interval() { x2("a{1,3}?", "aaa", 0, 1); x2("a{3}", "aaa", 0, 3); @@ -148,6 +147,11 @@ static int test_fixed_interval() x2("a{3,3}?", "aaa", 0, 3); n("a{3,3}?", "aa"); + return 0; +} + +static int test_possessive_interval() +{ x2("a{1,3}+", "aaaaaa", 0, 3); x2("a{3}+", "aaaaaa", 0, 3); x2("a{3,3}+", "aaaaaa", 0, 3); @@ -209,6 +213,52 @@ static int test_look_behind() return 0; } +static int test_python_option_ascii() +{ + x2("(?a)\\w", "a", 0, 1); + x2("\\w", "あ", 0, 3); + n("(?a)\\w", "あ"); + x2("\\s", "　", 0, 3); + n("(?a)\\s", "　"); + x2("\\d", "５", 0, 3); + n("(?a)\\d", "５"); + x2("あ\\b ", "あ ", 0, 4); + n("(?a)あ\\b ", "あ "); + n("あ\\B ", "あ "); + x2("(?a)あ\\B ", "あ ", 0, 4); + x2("(?a)\\W", "あ", 0, 3); + n("\\W", "あ"); + x2("(?a)\\S", "　", 0, 3); + n("\\S", "　"); + x2("(?a)\\D", "５", 0, 3); + n("\\D", "５"); + + return 0; +} + +static int test_python_z() +{ + x2("a\\Z", "a", 0, 1); + n("a\\Z", "a\n"); + e("\\z", "a", ONIGERR_UNDEFINED_OPERATOR); + + return 0; +} + +static int test_python_single_multi() +{ + n(".", "\n"); + x2("(?s).", "\n", 0, 1); + + n("^abc", "\nabc"); + x2("(?m)^abc", "\nabc", 1, 4); + n("abc$", "abc\ndef"); + x2("abc$", "abc\n", 0, 3); + x2("(?m)abc$", "abc\ndef", 0, 3); + + return 0; +} + extern int main(int argc, char* argv[]) { OnigEncoding use_encs[1]; @@ -222,7 +272,8 @@ extern int main(int argc, char* argv[]) Syntax = ONIG_SYNTAX_PERL; - test_fixed_interval(); + test_reluctant_interval(); + 
test_possessive_interval(); test_isolated_option(); test_prec_read(); test_look_behind(); @@ -235,7 +286,8 @@ extern int main(int argc, char* argv[]) Syntax = ONIG_SYNTAX_JAVA; - test_fixed_interval(); + test_reluctant_interval(); + test_possessive_interval(); test_isolated_option(); test_prec_read(); test_look_behind(); @@ -243,6 +295,21 @@ extern int main(int argc, char* argv[]) n("(?<!ab|b)c", "bbc"); n("(?<!b|ab)c", "bbc"); + Syntax = ONIG_SYNTAX_PYTHON; + + test_reluctant_interval(); + test_python_option_ascii(); + test_python_z(); + test_python_single_multi(); + x2("(?P<name>abc)", "abc", 0, 3); + x2("(?P<name>abc)(?P=name)", "abcabc", 0, 6); + x2("(?P<name>abc){0}(?P>name)", "abc", 0, 3); + x2("(?P<expr>[^()]+|\\((?P>expr)\\)){0}(?P>expr)", "((((xyz))))", 0, 11); + x2("\\u0041", "A", 0, 1); + x2("\\U00000041", "A", 0, 1); + e("\\U0041", "A", ONIGERR_INVALID_CODE_POINT_VALUE); + + fprintf(stdout, "\nRESULT SUCC: %4d, FAIL: %d, ERROR: %d (by Oniguruma %s)\n", nsucc, nfail, nerror, onig_version()); diff --git a/test/test_utf8.c b/test/test_utf8.c index 7a4322d..9822308 100644 --- a/test/test_utf8.c +++ b/test/test_utf8.c @@ -1,8 +1,7 @@ /* * test_utf8.c - * Copyright (c) 2019-2020 K.Kosako + * Copyright (c) 2019-2021 K.Kosako */ -#include "config.h" #ifdef ONIG_ESCAPE_UCHAR_COLLISION #undef ONIG_ESCAPE_UCHAR_COLLISION #endif @@ -27,7 +26,7 @@ static FILE* err_file; static OnigRegion* region; static void xx(char* pattern, char* str, int from, int to, int mem, int not, - int error_no) + int error_no, int line_no) { #ifdef __TRUSTINSOFT_ANALYZER__ if (nall++ % TIS_TEST_CHOOSE_MAX != TIS_TEST_CHOOSE_CURRENT) return; @@ -44,17 +43,17 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not, if (error_no == 0) { onig_error_code_to_str((UChar* )s, r, &einfo); - fprintf(err_file, "ERROR: %s /%s/\n", s, pattern); + fprintf(err_file, "ERROR: %s /%s/ #%d\n", s, pattern, line_no); nerror++; } else { if (r == error_no) { - fprintf(stdout, "OK(ERROR): 
/%s/ %d\n", pattern, r); + fprintf(stdout, "OK(ERROR): /%s/ %d #%d\n", pattern, r, line_no); nsucc++; } else { - fprintf(stdout, "FAIL(ERROR): /%s/ '%s', %d, %d\n", pattern, str, - error_no, r); + fprintf(stdout, "FAIL(ERROR): /%s/ '%s', %d, %d #%d\n", pattern, str, + error_no, r, line_no); nfail++; } } @@ -70,17 +69,18 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not, if (error_no == 0) { onig_error_code_to_str((UChar* )s, r); - fprintf(err_file, "ERROR: %s /%s/\n", s, pattern); + fprintf(err_file, "ERROR: %s /%s/ #%d\n", s, pattern, line_no); nerror++; } else { if (r == error_no) { - fprintf(stdout, "OK(ERROR): /%s/ '%s', %d\n", pattern, str, r); + fprintf(stdout, "OK(ERROR): /%s/ '%s', %d #%d\n", + pattern, str, r, line_no); nsucc++; } else { - fprintf(stdout, "FAIL ERROR NO: /%s/ '%s', %d, %d\n", pattern, str, - error_no, r); + fprintf(stdout, "FAIL ERROR NO: /%s/ '%s', %d, %d #%d\n", + pattern, str, error_no, r, line_no); nfail++; } } @@ -90,27 +90,27 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not, if (r == ONIG_MISMATCH) { if (not) { - fprintf(stdout, "OK(N): /%s/ '%s'\n", pattern, str); + fprintf(stdout, "OK(N): /%s/ '%s' #%d\n", pattern, str, line_no); nsucc++; } else { - fprintf(stdout, "FAIL: /%s/ '%s'\n", pattern, str); + fprintf(stdout, "FAIL: /%s/ '%s' #%d\n", pattern, str, line_no); nfail++; } } else { if (not) { - fprintf(stdout, "FAIL(N): /%s/ '%s'\n", pattern, str); + fprintf(stdout, "FAIL(N): /%s/ '%s' #%d\n", pattern, str, line_no); nfail++; } else { if (region->beg[mem] == from && region->end[mem] == to) { - fprintf(stdout, "OK: /%s/ '%s'\n", pattern, str); + fprintf(stdout, "OK: /%s/ '%s' #%d\n", pattern, str, line_no); nsucc++; } else { - fprintf(stdout, "FAIL: /%s/ '%s' %d-%d : %d-%d\n", pattern, str, - from, to, region->beg[mem], region->end[mem]); + fprintf(stdout, "FAIL: /%s/ '%s' %d-%d : %d-%d #%d\n", pattern, str, + from, to, region->beg[mem], region->end[mem], line_no); 
nfail++; } } @@ -118,26 +118,31 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not, onig_free(reg); } -static void x2(char* pattern, char* str, int from, int to) +static void xx2(char* pattern, char* str, int from, int to, int line_no) { - xx(pattern, str, from, to, 0, 0, 0); + xx(pattern, str, from, to, 0, 0, 0, line_no); } -static void x3(char* pattern, char* str, int from, int to, int mem) +static void xx3(char* pattern, char* str, int from, int to, int mem, int line_no) { - xx(pattern, str, from, to, mem, 0, 0); + xx(pattern, str, from, to, mem, 0, 0, line_no); } -static void n(char* pattern, char* str) +static void xn(char* pattern, char* str, int line_no) { - xx(pattern, str, 0, 0, 0, 1, 0); + xx(pattern, str, 0, 0, 0, 1, 0, line_no); } -static void e(char* pattern, char* str, int error_no) +static void xe(char* pattern, char* str, int error_no, int line_no) { - xx(pattern, str, 0, 0, 0, 0, error_no); + xx(pattern, str, 0, 0, 0, 0, error_no, line_no); } +#define x2(p,s,f,t) xx2(p,s,f,t, __LINE__) +#define x3(p,s,f,t,m) xx3(p,s,f,t,m, __LINE__) +#define n(p,s) xn(p,s, __LINE__) +#define e(p,s,en) xe(p,s,en, __LINE__) + extern int main(int argc, char* argv[]) { OnigEncoding use_encs[1]; @@ -359,6 +364,114 @@ extern int main(int argc, char* argv[]) x2("(.*)a\\1f", "bacbabf", 3, 7); x2("((.*)a\\2f)", "bacbabf", 3, 7); x2("(.*)a\\1f", "baczzzzzz\nbazz\nzzzzbabf", 19, 23); + x2("(?:x?)?", "", 0, 0); + x2("(?:x?)?", "x", 0, 1); + x2("(?:x?)?", "xx", 0, 1); + x2("(?:x?)*", "", 0, 0); + x2("(?:x?)*", "x", 0, 1); + x2("(?:x?)*", "xx", 0, 2); + x2("(?:x?)+", "", 0, 0); + x2("(?:x?)+", "x", 0, 1); + x2("(?:x?)+", "xx", 0, 2); + x2("(?:x?)\?\?", "", 0, 0); + x2("(?:x?)\?\?", "x", 0, 0); + x2("(?:x?)\?\?", "xx", 0, 0); + x2("(?:x?)*?", "", 0, 0); + x2("(?:x?)*?", "x", 0, 0); + x2("(?:x?)*?", "xx", 0, 0); + x2("(?:x?)+?", "", 0, 0); + x2("(?:x?)+?", "x", 0, 1); + x2("(?:x?)+?", "xx", 0, 1); + x2("(?:x*)?", "", 0, 0); + x2("(?:x*)?", "x", 0, 1); + 
x2("(?:x*)?", "xx", 0, 2); + x2("(?:x*)*", "", 0, 0); + x2("(?:x*)*", "x", 0, 1); + x2("(?:x*)*", "xx", 0, 2); + x2("(?:x*)+", "", 0, 0); + x2("(?:x*)+", "x", 0, 1); + x2("(?:x*)+", "xx", 0, 2); + x2("(?:x*)\?\?", "", 0, 0); + x2("(?:x*)\?\?", "x", 0, 0); + x2("(?:x*)\?\?", "xx", 0, 0); + x2("(?:x*)*?", "", 0, 0); + x2("(?:x*)*?", "x", 0, 0); + x2("(?:x*)*?", "xx", 0, 0); + x2("(?:x*)+?", "", 0, 0); + x2("(?:x*)+?", "x", 0, 1); + x2("(?:x*)+?", "xx", 0, 2); + x2("(?:x+)?", "", 0, 0); + x2("(?:x+)?", "x", 0, 1); + x2("(?:x+)?", "xx", 0, 2); + x2("(?:x+)*", "", 0, 0); + x2("(?:x+)*", "x", 0, 1); + x2("(?:x+)*", "xx", 0, 2); + n("(?:x+)+", ""); + x2("(?:x+)+", "x", 0, 1); + x2("(?:x+)+", "xx", 0, 2); + x2("(?:x+)\?\?", "", 0, 0); + x2("(?:x+)\?\?", "x", 0, 0); + x2("(?:x+)\?\?", "xx", 0, 0); + x2("(?:x+)*?", "", 0, 0); + x2("(?:x+)*?", "x", 0, 0); + x2("(?:x+)*?", "xx", 0, 0); + n("(?:x+)+?", ""); + x2("(?:x+)+?", "x", 0, 1); + x2("(?:x+)+?", "xx", 0, 2); + x2("(?:x\?\?)?", "", 0, 0); + x2("(?:x\?\?)?", "x", 0, 0); + x2("(?:x\?\?)?", "xx", 0, 0); + x2("(?:x\?\?)*", "", 0, 0); + x2("(?:x\?\?)*", "x", 0, 0); + x2("(?:x\?\?)*", "xx", 0, 0); + x2("(?:x\?\?)+", "", 0, 0); + x2("(?:x\?\?)+", "x", 0, 0); + x2("(?:x\?\?)+", "xx", 0, 0); + x2("(?:x\?\?)\?\?", "", 0, 0); + x2("(?:x\?\?)\?\?", "x", 0, 0); + x2("(?:x\?\?)\?\?", "xx", 0, 0); + x2("(?:x\?\?)*?", "", 0, 0); + x2("(?:x\?\?)*?", "x", 0, 0); + x2("(?:x\?\?)*?", "xx", 0, 0); + x2("(?:x\?\?)+?", "", 0, 0); + x2("(?:x\?\?)+?", "x", 0, 0); + x2("(?:x\?\?)+?", "xx", 0, 0); + x2("(?:x*?)?", "", 0, 0); + x2("(?:x*?)?", "x", 0, 0); + x2("(?:x*?)?", "xx", 0, 0); + x2("(?:x*?)*", "", 0, 0); + x2("(?:x*?)*", "x", 0, 0); + x2("(?:x*?)*", "xx", 0, 0); + x2("(?:x*?)+", "", 0, 0); + x2("(?:x*?)+", "x", 0, 0); + x2("(?:x*?)+", "xx", 0, 0); + x2("(?:x*?)\?\?", "", 0, 0); + x2("(?:x*?)\?\?", "x", 0, 0); + x2("(?:x*?)\?\?", "xx", 0, 0); + x2("(?:x*?)*?", "", 0, 0); + x2("(?:x*?)*?", "x", 0, 0); + x2("(?:x*?)*?", "xx", 0, 0); + 
x2("(?:x*?)+?", "", 0, 0); + x2("(?:x*?)+?", "x", 0, 0); + x2("(?:x*?)+?", "xx", 0, 0); + x2("(?:x+?)?", "", 0, 0); + x2("(?:x+?)?", "x", 0, 1); + x2("(?:x+?)?", "xx", 0, 1); + x2("(?:x+?)*", "", 0, 0); + x2("(?:x+?)*", "x", 0, 1); + x2("(?:x+?)*", "xx", 0, 2); + n("(?:x+?)+", ""); + x2("(?:x+?)+", "x", 0, 1); + x2("(?:x+?)+", "xx", 0, 2); + x2("(?:x+?)\?\?", "", 0, 0); + x2("(?:x+?)\?\?", "x", 0, 0); + x2("(?:x+?)\?\?", "xx", 0, 0); + x2("(?:x+?)*?", "", 0, 0); + x2("(?:x+?)*?", "x", 0, 0); + x2("(?:x+?)*?", "xx", 0, 0); + n("(?:x+?)+?", ""); + x2("(?:x+?)+?", "x", 0, 1); + x2("(?:x+?)+?", "xx", 0, 1); x2("a|b", "a", 0, 1); x2("a|b", "b", 0, 1); x2("|a", "a", 0, 0); @@ -1348,9 +1461,12 @@ extern int main(int argc, char* argv[]) x2("((?(a)\\g<1>|b))", "aab", 0, 3); x2("((?(a)\\g<1>))", "aab", 0, 2); + x2("((?(a)\\g<1>))", "", 0, 0); x2("(b(?(a)|\\g<1>))", "bba", 0, 3); e("(()(?(2)\\g<1>))", "", ONIGERR_NEVER_ENDING_RECURSION); x2("(?(a)(?:b|c))", "ac", 0, 2); + x2("(?(a)(?:b|c))", "", 0, 0); + x2("(?(a)b)", "", 0, 0); n("^(?(a)b|c)", "ac"); x2("(?i)a|b", "B", 0, 1); n("((?i)a|b.)|c", "C"); @@ -1479,6 +1595,7 @@ extern int main(int argc, char* argv[]) e("[\\x61-\\x{0063-0065}]+", "", ONIGERR_INVALID_CODE_POINT_VALUE); x2("[t\\x{0063 0071}]+", "tcqb", 0, 3); x2("[\\W\\x{0063 0071}]+", "*cqa", 0, 3); + x2("(\\O|(?=z\\g<2>*))(\\g<0>){0}", "a", 0, 1); n("a(b|)+d", "abbbbbbbbbbbbbbbbbbbbbbbbbbbbbbcd"); /* https://www.haijin-boys.com/discussions/5079 */ n(" \xfd", ""); /* https://bugs.php.net/bug.php?id=77370 */ @@ -1491,6 +1608,7 @@ extern int main(int argc, char* argv[]) n("(?x)\n (?<!\\+\\+|--)(?<=[({\\[,?=>:*]|&&|\\|\\||\\?|\\*\\/|^await|[^\\._$[:alnum:]]await|^return|[^\\._$[:alnum:]]return|^default|[^\\._$[:alnum:]]default|^yield|[^\\._$[:alnum:]]yield|^)\\s*\n (?!<\\s*[_$[:alpha:]][_$[:alnum:]]*((\\s+extends\\s+[^=>])|,)) # look ahead is not type parameter of arrow\n 
(?=(<)\\s*(?:([_$[:alpha:]][-_$[:alnum:].]*)(?<!\\.|-)(:))?((?:[a-z][a-z0-9]*|([_$[:alpha:]][-_$[:alnum:].]*))(?<!\\.|-))(?=((<\\s*)|(\\s+))(?!\\?)|\\/?>))", " while (i < len && f(array[i]))"); /* Issue #192 */ x2("aaaaaaaaaaaaaaaaaaaaaaaあb", "aaaaaaaaaaaaaaaaaaaaaaaあb", 0, 27); /* Issue #221 */ + n("d{65538}+{61533} ", "d{65538}+{61533} "); e("x{55380}{77590}", "", ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); e("(xyz){40000}{99999}(?<name>vv)", "", ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); diff --git a/test/testc.c b/test/testc.c index b3a34ea..5f7c4f0 100644 --- a/test/testc.c +++ b/test/testc.c @@ -1,8 +1,7 @@ /* * testc.c - * Copyright (c) 2019-2020 K.Kosako + * Copyright (c) 2019-2021 K.Kosako */ -#include "config.h" #include <stdio.h> #include <string.h> diff --git a/test/testp.c b/test/testp.c index b88d0e3..3158925 100644 --- a/test/testp.c +++ b/test/testp.c @@ -1,8 +1,7 @@ /* * testp.c - * Copyright (c) 2020 K.Kosako + * Copyright (c) 2020-2021 K.Kosako */ -#include "config.h" #include <stdio.h> #include <string.h> |