From 40f3d0030e6e98bcb02d6523e5ee48497dec49a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Wed, 7 Aug 2019 09:32:48 +0200 Subject: New upstream version 6.9.3 --- src/gb18030.c | 6 +- src/oniguruma.h | 11 +++- src/regcomp.c | 156 +++++++++++++++++++++++++++------------------- src/regenc.c | 2 + src/regerror.c | 17 +++++ src/regexec.c | 130 ++++++++++++++++++++++++++------------ src/regext.c | 6 +- src/regint.h | 6 +- src/regparse.c | 190 +++++++++++++++++++++++++++++++++----------------------- src/regparse.h | 22 +++---- src/utf16_be.c | 35 +++++++++-- src/utf16_le.c | 26 ++++++-- 12 files changed, 393 insertions(+), 214 deletions(-) (limited to 'src') diff --git a/src/gb18030.c b/src/gb18030.c index 7654432..8d415b0 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -2,7 +2,7 @@ gb18030.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2005-2018 KUBO Takehiro + * Copyright (c) 2005-2019 KUBO Takehiro * K.Kosako * All rights reserved. * @@ -67,11 +67,11 @@ gb18030_mbc_enc_len(const UChar* p) { if (GB18030_MAP[*p] != CM) return 1; + p++; if (GB18030_MAP[*p] == C4) return 4; - if (GB18030_MAP[*p] == C1) - return 1; /* illegal sequence */ + return 2; } diff --git a/src/oniguruma.h b/src/oniguruma.h index f6aa5ba..90cf2d9 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -36,9 +36,9 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 6 #define ONIGURUMA_VERSION_MINOR 9 -#define ONIGURUMA_VERSION_TEENY 2 +#define ONIGURUMA_VERSION_TEENY 3 -#define ONIGURUMA_VERSION_INT 60902 +#define ONIGURUMA_VERSION_INT 60903 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -52,6 +52,7 @@ extern "C" { # define PV_(args) args #endif +#ifndef ONIG_STATIC #ifndef ONIG_EXTERN #if defined(_WIN32) && !defined(__GNUC__) #if defined(ONIGURUMA_EXPORT) @@ -65,6 +66,9 @@ extern "C" { #ifndef ONIG_EXTERN #define ONIG_EXTERN extern #endif +#else +#define ONIG_EXTERN extern +#endif /* PART: character encoding */ @@ -517,6 +521,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1U<<21) /* [..\w..] etc.. */ #define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1U<<22) #define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1U<<23) /* [0-9-a]=[0-9\-a] */ +#define ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC (1U<<26) /* syntax (behavior) warning */ #define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1U<<24) /* [,-,] */ #define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1U<<25) /* (?:a*)+ */ @@ -766,6 +771,8 @@ int onig_init P_((void)); ONIG_EXTERN int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...)); ONIG_EXTERN +int onig_is_error_code_needs_param PV_((int code)); +ONIG_EXTERN void onig_set_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN void onig_set_verb_warn_func P_((OnigWarnFunc f)); diff --git a/src/regcomp.c b/src/regcomp.c index c2c04a4..b96c793 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -599,12 +599,34 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) } static int -compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env) +is_strict_real_node(Node* node) +{ + switch (NODE_TYPE(node)) { + case NODE_STRING: + { + StrNode* sn = STR_(node); + return (sn->end != sn->s); + } + break; + + case NODE_CCLASS: + case NODE_CTYPE: + return 1; + break; + + default: + return 0; + break; + } +} + +static int +compile_tree_empty_check(Node* node, regex_t* reg, int emptiness, ScanEnv* env) { int r; int saved_num_null_check = reg->num_null_check; - if (empty_info != BODY_IS_NOT_EMPTY) { + if (emptiness != BODY_IS_NOT_EMPTY) { r = add_op(reg, OP_EMPTY_CHECK_START); if (r != 0) return r; COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */ @@ -614,12 +636,12 @@ compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env) r = compile_tree(node, reg, env); if (r != 0) return r; - if (empty_info != BODY_IS_NOT_EMPTY) { - if (empty_info == BODY_IS_EMPTY) + if (emptiness != BODY_IS_NOT_EMPTY) { + if (emptiness == BODY_IS_EMPTY_POSSIBILITY) r = add_op(reg, OP_EMPTY_CHECK_END); - else if (empty_info == BODY_IS_EMPTY_MEM) + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); - else if (empty_info == BODY_IS_EMPTY_REC) + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); if (r != 0) return r; @@ -895,12 +917,12 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) } p[id].lower = lower; - p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); + p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper); return 0; } static int -compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, +compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, regex_t* reg, ScanEnv* env) { int r; @@ -915,7 +937,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper); if (r != 0) return r; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; if ( @@ -937,7 +959,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, static int is_anychar_infinite_greedy(QuantNode* qn) { - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && + if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) && NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn))) return 1; else @@ -951,8 +973,8 @@ static int compile_length_quantifier_node(QuantNode* qn, regex_t* reg) { int len, mod_tlen; - int infinite = IS_REPEAT_INFINITE(qn->upper); - enum BodyEmpty empty_info = qn->empty_info; + int infinite = IS_INFINITE_REPEAT(qn->upper); + enum BodyEmptyType emptiness = qn->emptiness; int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg); if (tlen < 0) return tlen; @@ -969,10 +991,9 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) } } - if (empty_info == BODY_IS_NOT_EMPTY) - mod_tlen = tlen; - else - mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); + mod_tlen = tlen; + if (emptiness != BODY_IS_NOT_EMPTY) + mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || @@ -1026,8 +1047,8 @@ static int compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) { int i, r, mod_tlen; - int infinite = IS_REPEAT_INFINITE(qn->upper); - enum BodyEmpty empty_info = qn->empty_info; + int infinite = IS_INFINITE_REPEAT(qn->upper); + enum BodyEmptyType emptiness = qn->emptiness; int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg); if (tlen < 0) return tlen; @@ -1055,10 +1076,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } } - if (empty_info == BODY_IS_NOT_EMPTY) - mod_tlen = tlen; - else - mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); + mod_tlen = tlen; + if (emptiness != BODY_IS_NOT_EMPTY) + mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || @@ -1096,7 +1116,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) COP(reg)->push_or_jump_exact1.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; addr = -(mod_tlen + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1); @@ -1109,7 +1129,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) COP(reg)->push_if_peek_next.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; addr = -(mod_tlen + (int )SIZE_OP_PUSH_IF_PEEK_NEXT); @@ -1119,7 +1139,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; COP(reg)->push.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; addr = -(mod_tlen + (int )SIZE_OP_PUSH); @@ -1134,7 +1154,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; COP(reg)->jump.addr = mod_tlen + SIZE_INC_OP; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; r = add_op(reg, OP_PUSH); @@ -1188,7 +1208,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) r = compile_tree(NODE_QUANT_BODY(qn), reg, env); } else { - r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg, env); + r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env); } return r; } @@ -1273,7 +1293,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) break; case BAG_STOP_BACKTRACK: - if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { + if (NODE_IS_STRICT_REAL_REPEAT(node)) { int v; QuantNode* qn; @@ -1307,8 +1327,9 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len += tlen; } + len += SIZE_OP_JUMP + SIZE_OP_ATOMIC_END; + if (IS_NOT_NULL(Else)) { - len += SIZE_OP_JUMP; tlen = compile_length_tree(Else, reg); if (tlen < 0) return tlen; len += tlen; @@ -1423,7 +1444,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) break; case BAG_STOP_BACKTRACK: - if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { + if (NODE_IS_STRICT_REAL_REPEAT(node)) { QuantNode* qn = QUANT_(NODE_BAG_BODY(node)); r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env); if (r != 0) return r; @@ -1455,7 +1476,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) case BAG_IF_ELSE: { - int cond_len, then_len, jump_len; + int cond_len, then_len, else_len, jump_len; Node* cond = NODE_BAG_BODY(node); Node* Then = node->te.Then; Node* Else = node->te.Else; @@ -1472,8 +1493,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) else then_len = 0; - jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END; - if (IS_NOT_NULL(Else)) jump_len += SIZE_OP_JUMP; + jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END + SIZE_OP_JUMP; r = add_op(reg, OP_PUSH); if (r != 0) return r; @@ -1490,11 +1510,20 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) } if (IS_NOT_NULL(Else)) { - int else_len = compile_length_tree(Else, reg); - r = add_op(reg, OP_JUMP); - if (r != 0) return r; - COP(reg)->jump.addr = else_len + SIZE_INC_OP; + else_len = compile_length_tree(Else, reg); + if (else_len < 0) return else_len; + } + else + else_len = 0; + + r = add_op(reg, OP_JUMP); + if (r != 0) return r; + COP(reg)->jump.addr = SIZE_OP_ATOMIC_END + else_len + SIZE_INC_OP; + r = add_op(reg, OP_ATOMIC_END); + if (r != 0) return r; + + if (IS_NOT_NULL(Else)) { r = compile_tree(Else, reg, env); } } @@ -3035,7 +3064,7 @@ tree_max_len(Node* node, ScanEnv* env) if (qn->upper != 0) { len = tree_max_len(NODE_BODY(node), env); if (len != 0) { - if (! IS_REPEAT_INFINITE(qn->upper)) + if (! IS_INFINITE_REPEAT(qn->upper)) len = distance_multiply(len, qn->upper); else len = INFINITE_LEN; @@ -3581,7 +3610,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) type = NODE_TYPE(node); if (type == NODE_QUANT) { QuantNode* qn = QUANT_(node); - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { + if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) { #ifdef USE_QUANT_PEEK_NEXT Node* n = get_head_value_node(next_node, 1, reg); /* '\0': for UTF-16BE etc... */ @@ -3591,7 +3620,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) #endif /* automatic posseivation a*b ==> (?>a*)b */ if (qn->lower <= 1) { - if (NODE_IS_SIMPLE_TYPE(NODE_BODY(node))) { + if (is_strict_real_node(NODE_BODY(node))) { Node *x, *y; x = get_head_value_node(NODE_BODY(node), 0, reg); if (IS_NOT_NULL(x)) { @@ -3599,7 +3628,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) { Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK); CHECK_NULL_RETURN_MEMERR(en); - NODE_STATUS_ADD(en, STOP_BT_SIMPLE_REPEAT); + NODE_STATUS_ADD(en, STRICT_REAL_REPEAT); swap_node(node, en); NODE_BODY(node) = en; } @@ -4001,11 +4030,11 @@ expand_case_fold_string(Node* node, regex_t* reg, int state) return r; } -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT -static enum BodyEmpty +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +static enum BodyEmptyType quantifiers_memory_node_info(Node* node) { - int r = BODY_IS_EMPTY; + int r = BODY_IS_EMPTY_POSSIBILITY; switch (NODE_TYPE(node)) { case NODE_LIST: @@ -4022,7 +4051,7 @@ quantifiers_memory_node_info(Node* node) #ifdef USE_CALL case NODE_CALL: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_REC; /* tiny version */ + return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */ } else r = quantifiers_memory_node_info(NODE_BODY(node)); @@ -4044,9 +4073,9 @@ quantifiers_memory_node_info(Node* node) switch (en->type) { case BAG_MEMORY: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_REC; + return BODY_IS_EMPTY_POSSIBILITY_REC; } - return BODY_IS_EMPTY_MEM; + return BODY_IS_EMPTY_POSSIBILITY_MEM; break; case BAG_OPTION: @@ -4083,7 +4112,7 @@ quantifiers_memory_node_info(Node* node) return r; } -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #ifdef USE_CALL @@ -4351,7 +4380,7 @@ setup_called_state_call(Node* node, int state) { QuantNode* qn = QUANT_(node); - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; @@ -4468,7 +4497,7 @@ setup_called_state(Node* node, int state) { QuantNode* qn = QUANT_(node); - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; @@ -4600,24 +4629,24 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) NODE_STATUS_ADD(node, IN_MULTI_ENTRY); } - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) { d = tree_min_len(body, env); if (d == 0) { -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT - qn->empty_info = quantifiers_memory_node_info(body); - if (qn->empty_info == BODY_IS_EMPTY_REC) { +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT + qn->emptiness = quantifiers_memory_node_info(body); + if (qn->emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) { if (NODE_TYPE(body) == NODE_BAG && BAG_(body)->type == BAG_MEMORY) { MEM_STATUS_ON(env->bt_mem_end, BAG_(body)->m.regnum); } } #else - qn->empty_info = BODY_IS_EMPTY; + qn->emptiness = BODY_IS_EMPTY_POSSIBILITY; #endif } } - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; @@ -4628,7 +4657,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) /* expand string */ #define EXPAND_STRING_MAX_LENGTH 100 if (NODE_TYPE(body) == NODE_STRING) { - if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && + if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper && qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { int len = NODE_STRING_LEN(body); StrNode* sn = STR_(body); @@ -4646,7 +4675,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } } - if (qn->greedy && (qn->empty_info == BODY_IS_NOT_EMPTY)) { + if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) { if (NODE_TYPE(body) == NODE_QUANT) { QuantNode* tqn = QUANT_(body); if (IS_NOT_NULL(tqn->head_exact)) { @@ -4663,7 +4692,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } /* setup_tree does the following work. - 1. check empty loop. (set qn->empty_info) + 1. check empty loop. (set qn->emptiness) 2. expand ignore-case in char class. 3. set memory status bit flags. (reg->mem_stats) 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. @@ -4752,10 +4781,10 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) r = setup_tree(target, reg, state, env); if (NODE_TYPE(target) == NODE_QUANT) { QuantNode* tqn = QUANT_(target); - if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && + if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 && tqn->greedy != 0) { /* (?>a*), a*+ etc... */ - if (NODE_IS_SIMPLE_TYPE(NODE_BODY(target))) - NODE_STATUS_ADD(node, STOP_BT_SIMPLE_REPEAT); + if (is_strict_real_node(NODE_BODY(target))) + NODE_STATUS_ADD(node, STRICT_REAL_REPEAT); } } } @@ -5752,7 +5781,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) opt->sm.reach_end = 0; } - if (IS_REPEAT_INFINITE(qn->upper)) { + if (IS_INFINITE_REPEAT(qn->upper)) { if (env->mmd.max == 0 && NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) { if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) @@ -6672,6 +6701,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) } else { len = ONIGENC_CODE_TO_MBCLEN(enc, code); + if (len < 0) return 0; } return onig_is_code_in_cc_len(len, code, cc); } diff --git a/src/regenc.c b/src/regenc.c index 6376565..9fab721 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -853,6 +853,8 @@ onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag, extern int onigenc_mb2_code_to_mbclen(OnigCodePoint code) { + if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + if ((code & 0xff00) != 0) return 2; else return 1; } diff --git a/src/regerror.c b/src/regerror.c index 7564827..e6d1806 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -257,6 +257,23 @@ static int to_ascii(OnigEncoding enc, UChar *s, UChar *end, } +extern int +onig_is_error_code_needs_param(int code) +{ + switch (code) { + case ONIGERR_UNDEFINED_NAME_REFERENCE: + case ONIGERR_UNDEFINED_GROUP_REFERENCE: + case ONIGERR_MULTIPLEX_DEFINED_NAME: + case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL: + case ONIGERR_INVALID_GROUP_NAME: + case ONIGERR_INVALID_CHAR_IN_GROUP_NAME: + case ONIGERR_INVALID_CHAR_PROPERTY_NAME: + return 1; + default: + return 0; + } +} + /* for ONIG_MAX_ERROR_MESSAGE_LEN */ #define MAX_ERROR_PAR_LEN 30 diff --git a/src/regexec.c b/src/regexec.c index 6618996..f957b75 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -980,6 +980,8 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) #define STK_CALL_FRAME 0x0400 #define STK_RETURN 0x0500 #define STK_SAVE_VAL 0x0600 +#define STK_PREC_READ_START 0x0700 +#define STK_PREC_READ_END 0x0800 /* stack type check mask */ #define STK_MASK_POP_USED STK_ALT_FLAG @@ -1544,8 +1546,8 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) #define STACK_PUSH_SUPER_ALT(pat,s,sprev) STACK_PUSH(STK_SUPER_ALT,pat,s,sprev) -#define STACK_PUSH_POS(s,sprev) \ - STACK_PUSH(STK_TO_VOID_START,(Operation* )0,s,sprev) +#define STACK_PUSH_PREC_READ_START(s,sprev) \ + STACK_PUSH(STK_PREC_READ_START,(Operation* )0,s,sprev) #define STACK_PUSH_ALT_PREC_READ_NOT(pat,s,sprev) \ STACK_PUSH(STK_ALT_PREC_READ_NOT,pat,s,sprev) #define STACK_PUSH_TO_VOID_START STACK_PUSH_TYPE(STK_TO_VOID_START) @@ -1887,6 +1889,27 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ } while(0) +#define STACK_GET_PREC_READ_START(k) do {\ + int level = 0;\ + k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k, "STACK_GET_PREC_READ_START");\ + if (IS_TO_VOID_TARGET(k)) {\ + k->type = STK_VOID;\ + }\ + else if (k->type == STK_PREC_READ_START) {\ + if (level == 0) {\ + break;\ + }\ + level--;\ + }\ + else if (k->type == STK_PREC_READ_END) {\ + level++;\ + }\ + }\ +} while(0) + #define STACK_EMPTY_CHECK(isnull,sid,s) do {\ StackType* k = stk;\ while (1) {\ @@ -1913,7 +1936,7 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ } while (0) -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT #define STACK_EMPTY_CHECK_MEM(isnull,sid,s,reg) do {\ StackType* k = stk;\ while (1) {\ @@ -1927,9 +1950,10 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ else {\ UChar* endp;\ + int level = 0;\ (isnull) = 1;\ while (k < stk) {\ - if (k->type == STK_MEM_START) {\ + if (k->type == STK_MEM_START && level == 0) {\ STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ if (endp == 0) {\ (isnull) = 0; break;\ @@ -1941,6 +1965,12 @@ stack_double(int is_alloca, char** arg_alloc_base, (isnull) = -1; /* empty, but position changed */ \ }\ }\ + else if (k->type == STK_PREC_READ_START) {\ + level++;\ + }\ + else if (k->type == STK_PREC_READ_END) {\ + level--;\ + }\ k++;\ }\ break;\ @@ -1965,10 +1995,11 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ else {\ UChar* endp;\ + int prec_level = 0;\ (isnull) = 1;\ while (k < stk) {\ if (k->type == STK_MEM_START) {\ - if (level == 0) {\ + if (level == 0 && prec_level == 0) {\ STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ if (endp == 0) {\ (isnull) = 0; break;\ @@ -1987,6 +2018,12 @@ stack_double(int is_alloca, char** arg_alloc_base, else if (k->type == STK_EMPTY_CHECK_END) {\ if (k->zid == (sid)) level--;\ }\ + else if (k->type == STK_PREC_READ_START) {\ + prec_level++;\ + }\ + else if (k->type == STK_PREC_READ_END) {\ + prec_level--;\ + }\ k++;\ }\ break;\ @@ -2023,7 +2060,7 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ }\ } while(0) -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #define STACK_GET_REPEAT(sid, k) do {\ int level = 0;\ @@ -2968,6 +3005,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, NEXT_OUT; CASE_OP(CCLASS_MB) + DATA_ENSURE(1); if (! ONIGENC_IS_MBC_HEAD(encode, s)) goto fail; cclass_mb: @@ -3441,11 +3479,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - STRING_CMP(pstart, s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + STRING_CMP(s, pstart, n); + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } } INC_OP; JUMP_OUT; @@ -3468,11 +3508,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - STRING_CMP_IC(case_fold_flag, pstart, &s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + STRING_CMP_IC(case_fold_flag, pstart, &s, n); + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } } INC_OP; JUMP_OUT; @@ -3498,15 +3540,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - swork = s; - STRING_CMP_VALUE(pstart, swork, n, is_fail); - if (is_fail) continue; - s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE(swork, pstart, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } break; /* success */ } if (i == tlen) goto fail; @@ -3535,15 +3578,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - swork = s; - STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); - if (is_fail) continue; - s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } break; /* success */ } if (i == tlen) goto fail; @@ -3560,6 +3604,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int len; int level; MemNumType* mems; + UChar* ssave; n = 0; backref_with_level: @@ -3567,10 +3612,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, tlen = p->backref_general.num; mems = tlen == 1 ? &(p->backref_general.n1) : p->backref_general.ns; - sprev = s; + ssave = s; if (backref_match_at_nested_level(reg, stk, stk_base, n, case_fold_flag, level, (int )tlen, mems, &s, end)) { - if (sprev < end) { + if (ssave != s) { + sprev = ssave; while (sprev + (len = enclen(encode, sprev)) < s) sprev += len; } @@ -3658,7 +3704,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } JUMP_OUT; -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT CASE_OP(EMPTY_CHECK_END_MEMST) { int is_empty; @@ -3683,7 +3729,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int is_empty; mem = p->empty_check_end.mem; /* mem: null check id */ -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg); #else STACK_EMPTY_CHECK_REC(is_empty, mem, s); @@ -3851,14 +3897,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, goto repeat_inc_ng; CASE_OP(PREC_READ_START) - STACK_PUSH_POS(s, sprev); + STACK_PUSH_PREC_READ_START(s, sprev); INC_OP; JUMP_OUT; CASE_OP(PREC_READ_END) - STACK_EXEC_TO_VOID(stkp); + STACK_GET_PREC_READ_START(stkp); s = stkp->u.state.pstr; sprev = stkp->u.state.pstr_prev; + STACK_PUSH(STK_PREC_READ_END,0,0,0); INC_OP; JUMP_OUT; @@ -5443,6 +5490,9 @@ onig_builtin_error(OnigCalloutArgs* args, void* user_data ARG_UNUSED) if (n >= 0) { n = ONIGERR_INVALID_CALLOUT_BODY; } + else if (onig_is_error_code_needs_param(n)) { + n = ONIGERR_INVALID_CALLOUT_BODY; + } return n; } diff --git a/src/regext.c b/src/regext.c index fa4b360..965c793 100644 --- a/src/regext.c +++ b/src/regext.c @@ -29,6 +29,7 @@ #include "regint.h" +#if 0 static void conv_ext0be32(const UChar* s, const UChar* end, UChar* conv) { @@ -158,6 +159,7 @@ conv_encoding(OnigEncoding from, OnigEncoding to, const UChar* s, const UChar* e return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION; } +#endif extern int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, @@ -169,9 +171,7 @@ onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; if (ci->pattern_enc != ci->target_enc) { - r = conv_encoding(ci->pattern_enc, ci->target_enc, pattern, pattern_end, - &cpat, &cpat_end); - if (r != 0) return r; + return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION; } else { cpat = (UChar* )pattern; diff --git a/src/regint.h b/src/regint.h index 56767e8..38389a1 100644 --- a/src/regint.h +++ b/src/regint.h @@ -63,7 +63,7 @@ #define USE_CALL #define USE_CALLOUT #define USE_BACKREF_WITH_LEVEL /* \k, \k */ -#define USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ +#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR #define USE_RETRY_LIMIT_IN_MATCH @@ -348,8 +348,8 @@ typedef unsigned int MemStatusType; #define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \ ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) -#define REPEAT_INFINITE -1 -#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) +#define INFINITE_REPEAT -1 +#define IS_INFINITE_REPEAT(n) ((n) == INFINITE_REPEAT) /* bitset */ #define BITS_PER_BYTE 8 diff --git a/src/regparse.c b/src/regparse.c index f1deea3..7f8b1a9 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -77,6 +77,7 @@ OnigSyntaxType OnigSyntaxOniguruma = { ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | + ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE @@ -1093,6 +1094,35 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name, return e->back_num; } +static int +name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end, + int** nums) +{ + regex_t* reg; + NameEntry* e; + + reg = env->reg; + e = name_find(reg, name, name_end); + + if (IS_NULL(e)) { + onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, + (UChar* )name, (UChar* )name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + + switch (e->back_num) { + case 0: + break; + case 1: + *nums = &(e->back_ref1); + break; + default: + *nums = e->back_refs; + break; + } + return e->back_num; +} + extern int onig_name_to_backref_number(regex_t* reg, const UChar* name, const UChar* name_end, OnigRegion *region) @@ -1869,8 +1899,8 @@ callout_tag_table_new(CalloutTagTable** rt) } static int -callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end, - CalloutTagVal entry_val) +callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name, + UChar* name_end, CalloutTagVal entry_val) { int r; CalloutTagVal val; @@ -1879,8 +1909,11 @@ callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end, return ONIGERR_INVALID_CALLOUT_TAG_NAME; val = callout_tag_find(t, name, name_end); - if (val >= 0) + if (val >= 0) { + onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, + name, name_end); return ONIGERR_MULTIPLEX_DEFINED_NAME; + } r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val); if (r < 0) return r; @@ -1909,7 +1942,7 @@ ext_ensure_tag_table(regex_t* reg) } static int -callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, +callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, CalloutTagVal entry_val) { int r; @@ -1921,7 +1954,7 @@ callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, ext = onig_get_regex_ext(reg); CHECK_NULL_RETURN_MEMERR(ext); - r = callout_tag_entry_raw(ext->tag_table, name, name_end, entry_val); + r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val); e = onig_reg_callout_list_at(reg, (int )entry_val); CHECK_NULL_RETURN_MEMERR(e); @@ -2391,10 +2424,10 @@ node_new_quantifier(int lower, int upper, int by_number) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_QUANT); - QUANT_(node)->lower = lower; - QUANT_(node)->upper = upper; - QUANT_(node)->greedy = 1; - QUANT_(node)->empty_info = BODY_IS_NOT_EMPTY; + QUANT_(node)->lower = lower; + QUANT_(node)->upper = upper; + QUANT_(node)->greedy = 1; + QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; QUANT_(node)->head_exact = NULL_NODE; QUANT_(node)->next_head_exact = NULL_NODE; QUANT_(node)->is_refered = 0; @@ -2694,7 +2727,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[0] = x; ns[1] = NULL_NODE; - x = node_new_quantifier(0, REPEAT_INFINITE, 1); + x = node_new_quantifier(0, INFINITE_REPEAT, 1); if (IS_NULL(x)) goto err; NODE_BODY(x) = ns[0]; @@ -3044,7 +3077,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (expr == NULL_NODE) { /* default expr \O* */ - quant = node_new_quantifier(0, REPEAT_INFINITE, 0); + quant = node_new_quantifier(0, INFINITE_REPEAT, 0); if (IS_NULL(quant)) goto err0; r = node_new_true_anychar(&body, env); @@ -3086,7 +3119,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (r != 0) goto err; possessive = 1; - r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, REPEAT_INFINITE, + r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT, possessive, is_range_cutter, env); if (r != 0) goto err; @@ -3236,10 +3269,18 @@ node_new_empty(void) static Node* node_new_str_raw_char(UChar c) { + int i; UChar p[1]; + Node* node; p[0] = c; - return node_new_str_raw(p, p + 1); + node = node_new_str_raw(p, p + 1); + + /* clear buf tail */ + for (i = 1; i < NODE_STRING_BUF_SIZE; i++) + STR_(node)->buf[i] = '\0'; + + return node; } static Node* @@ -3275,24 +3316,6 @@ str_node_can_be_split(Node* node, OnigEncoding enc) return 0; } -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR -static int -node_str_head_pad(StrNode* sn, int num, UChar val) -{ - UChar buf[NODE_STRING_BUF_SIZE]; - int i, len; - - len = sn->end - sn->s; - onig_strcpy(buf, sn->s, sn->end); - onig_strcpy(&(sn->s[num]), buf, buf + len); - sn->end += num; - - for (i = 0; i < num; i++) { - sn->s[i] = val; - } -} -#endif - extern int onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) { @@ -3877,19 +3900,19 @@ quantifier_type_num(QuantNode* q) if (q->greedy) { if (q->lower == 0) { if (q->upper == 1) return 0; - else if (IS_REPEAT_INFINITE(q->upper)) return 1; + else if (IS_INFINITE_REPEAT(q->upper)) return 1; } else if (q->lower == 1) { - if (IS_REPEAT_INFINITE(q->upper)) return 2; + if (IS_INFINITE_REPEAT(q->upper)) return 2; } } else { if (q->lower == 0) { if (q->upper == 1) return 3; - else if (IS_REPEAT_INFINITE(q->upper)) return 4; + else if (IS_INFINITE_REPEAT(q->upper)) return 4; } else if (q->lower == 1) { - if (IS_REPEAT_INFINITE(q->upper)) return 5; + if (IS_INFINITE_REPEAT(q->upper)) return 5; } } return -1; @@ -3926,8 +3949,8 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode) pnum = quantifier_type_num(p); cnum = quantifier_type_num(c); if (pnum < 0 || cnum < 0) { - if ((p->lower == p->upper) && ! IS_REPEAT_INFINITE(p->upper)) { - if ((c->lower == c->upper) && ! IS_REPEAT_INFINITE(c->upper)) { + if ((p->lower == p->upper) && ! IS_INFINITE_REPEAT(p->upper)) { + if ((c->lower == c->upper) && ! IS_INFINITE_REPEAT(c->upper)) { int n = onig_positive_int_multiply(p->lower, c->lower); if (n >= 0) { p->lower = p->upper = n; @@ -3946,11 +3969,11 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode) break; case RQ_A: NODE_BODY(pnode) = NODE_BODY(cnode); - p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; + p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1; break; case RQ_AQ: NODE_BODY(pnode) = NODE_BODY(cnode); - p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; + p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0; break; case RQ_QQ: NODE_BODY(pnode) = NODE_BODY(cnode); @@ -3959,13 +3982,13 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode) case RQ_P_QQ: NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 0; - c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; + c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1; return ; break; case RQ_PQ_Q: NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 1; - c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; + c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0; return ; break; case RQ_ASIS: @@ -4158,7 +4181,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) if (p == prev) { if (non_low != 0) goto invalid; - up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ + up = INFINITE_REPEAT; /* {n,} : {n,infinite} */ } } else { @@ -4178,7 +4201,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) } if (c != '}') goto invalid; - if (!IS_REPEAT_INFINITE(up) && low > up) { + if (!IS_INFINITE_REPEAT(up) && low > up) { /* {n,m}+ supported case */ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL)) return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; @@ -4959,7 +4982,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 0; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -4967,7 +4990,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 1; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5358,10 +5381,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.ref1 = back_num; } else { - num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { @@ -5514,7 +5535,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 0; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5525,7 +5546,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 1; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5608,7 +5629,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.call.gnum = 0; tok->u.call.name = p; PINC; - if (! PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME; + if (! PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION; tok->u.call.name_end = p; break; @@ -6249,6 +6270,7 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) env->parse_depth++; if (env->parse_depth > ParseDepthLimit) return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + prev_cc = (CClassNode* )NULL; r = fetch_token_in_cc(tok, src, end, env); if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { @@ -6301,10 +6323,11 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) case TK_RAW_BYTE: /* tok->base != 0 : octal or hexadec. */ if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { + int i, j; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; UChar* psave = p; - int i, base = tok->base; + int base = tok->base; buf[0] = tok->u.c; for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { @@ -6322,6 +6345,9 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto err; } + /* clear buf tail */ + for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0'; + len = enclen(env->enc, buf); if (i < len) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; @@ -6359,8 +6385,13 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) val_entry: len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); if (len < 0) { - r = len; - goto err; + if (state != CCS_RANGE || + ! IS_SYNTAX_BV(env->syntax, + ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || + v < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { + r = len; + goto err; + } } in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT); val_entry2: @@ -6673,7 +6704,7 @@ parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv } if (tag_start != tag_end) { - r = callout_tag_entry(env->reg, tag_start, tag_end, num); + r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); if (r != ONIG_NORMAL) return r; } @@ -6994,7 +7025,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en } if (tag_start != tag_end) { - r = callout_tag_entry(env->reg, tag_start, tag_end, num); + r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); if (r != ONIG_NORMAL) return r; } @@ -7271,10 +7302,8 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, int num; int* backs; - num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { @@ -7414,6 +7443,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, } break; +#ifdef USE_CAPTURE_HISTORY case '@': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { @@ -7441,6 +7471,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_UNDEFINED_GROUP_OPTION; } break; +#endif #ifdef USE_POSIXLINE_OPTION case 'p': @@ -7688,7 +7719,7 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) if (targetq_num >= 0 && nestq_num < 0) { if (targetq_num == 1 || targetq_num == 2) { /* * or + */ /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ - if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { + if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) { qn->upper = (qn->lower == 0 ? 1 : qn->lower); } } @@ -7826,14 +7857,18 @@ static int parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, ScanEnv* env, int group_head) { - int r, len, group = 0; + int r, len, group; Node* qn; Node** tp; + unsigned int parse_depth; + group = 0; *np = NULL; if (tok->type == (enum TokenSyms )term) goto end_of_token; + parse_depth = env->parse_depth; + switch (tok->type) { case TK_ALT: case TK_EOT: @@ -7914,36 +7949,29 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { - if (len == enclen(env->enc, STR_(*np)->s)) {/* should not enclen_end() */ + if (len == enclen(env->enc, STR_(*np)->s)) { r = fetch_token(tok, src, end, env); - NODE_STRING_CLEAR_RAW(*np); - goto string_end; + goto tk_raw_byte_end; } } r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_RAW_BYTE) { - /* Don't use this, it is wrong for little endian encodings. */ -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR - int rem; - if (len < ONIGENC_MBC_MINLEN(env->enc)) { - rem = ONIGENC_MBC_MINLEN(env->enc) - len; - (void )node_str_head_pad(STR_(*np), rem, (UChar )0); - if (len + rem == enclen(env->enc, STR_(*np)->s)) { - NODE_STRING_CLEAR_RAW(*np); - goto string_end; - } - } -#endif + if (r != TK_RAW_BYTE) return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - } r = node_str_cat_char(*np, (UChar )tok->u.c); if (r < 0) return r; len++; } + + tk_raw_byte_end: + if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + + NODE_STRING_CLEAR_RAW(*np); + goto string_end; } break; @@ -8055,7 +8083,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_ANYCHAR_ANYTIME: *np = node_new_anychar(); CHECK_NULL_RETURN_MEMERR(*np); - qn = node_new_quantifier(0, REPEAT_INFINITE, 0); + qn = node_new_quantifier(0, INFINITE_REPEAT, 0); CHECK_NULL_RETURN_MEMERR(qn); NODE_BODY(qn) = *np; *np = qn; @@ -8158,6 +8186,10 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (is_invalid_quantifier_target(*tp)) return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; + parse_depth++; + if (parse_depth > ParseDepthLimit) + return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, r == TK_INTERVAL); CHECK_NULL_RETURN_MEMERR(qn); diff --git a/src/regparse.h b/src/regparse.h index b7a2867..231f7b5 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -66,11 +66,11 @@ enum GimmickType { #endif }; -enum BodyEmpty { - BODY_IS_NOT_EMPTY = 0, - BODY_IS_EMPTY = 1, - BODY_IS_EMPTY_MEM = 2, - BODY_IS_EMPTY_REC = 3 +enum BodyEmptyType { + BODY_IS_NOT_EMPTY = 0, + BODY_IS_EMPTY_POSSIBILITY = 1, + BODY_IS_EMPTY_POSSIBILITY_MEM = 2, + BODY_IS_EMPTY_POSSIBILITY_REC = 3 }; typedef struct { @@ -101,7 +101,7 @@ typedef struct { int lower; int upper; int greedy; - enum BodyEmpty empty_info; + enum BodyEmptyType emptiness; struct _Node* head_exact; struct _Node* next_head_exact; int is_refered; /* include called node. don't eliminate even if {0} */ @@ -252,10 +252,6 @@ typedef struct _Node { #define NODE_BIT_CALL NODE_TYPE2BIT(NODE_CALL) #define NODE_BIT_GIMMICK NODE_TYPE2BIT(NODE_GIMMICK) -#define NODE_IS_SIMPLE_TYPE(node) \ - ((NODE_TYPE2BIT(NODE_TYPE(node)) & \ - (NODE_BIT_STRING | NODE_BIT_CCLASS | NODE_BIT_CTYPE | NODE_BIT_BACKREF)) != 0) - #define NODE_TYPE(node) ((node)->u.base.node_type) #define NODE_SET_TYPE(node, ntype) (node)->u.base.node_type = (ntype) @@ -314,7 +310,7 @@ typedef struct _Node { #define NODE_ST_CLEN_FIXED (1<<2) #define NODE_ST_MARK1 (1<<3) #define NODE_ST_MARK2 (1<<4) -#define NODE_ST_STOP_BT_SIMPLE_REPEAT (1<<5) +#define NODE_ST_STRICT_REAL_REPEAT (1<<5) #define NODE_ST_RECURSION (1<<6) #define NODE_ST_CALLED (1<<7) #define NODE_ST_ADDR_FIXED (1<<8) @@ -357,8 +353,8 @@ typedef struct _Node { #define NODE_IS_SUPER(node) ((NODE_STATUS(node) & NODE_ST_SUPER) != 0) #define NODE_IS_PROHIBIT_RECURSION(node) \ ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) -#define NODE_IS_STOP_BT_SIMPLE_REPEAT(node) \ - ((NODE_STATUS(node) & NODE_ST_STOP_BT_SIMPLE_REPEAT) != 0) +#define NODE_IS_STRICT_REAL_REPEAT(node) \ + ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0) #define NODE_BODY(node) ((node)->u.base.body) #define NODE_QUANT_BODY(node) ((node)->body) diff --git a/src/utf16_be.c b/src/utf16_be.c index 22bf74d..b66d868 100644 --- a/src/utf16_be.c +++ b/src/utf16_be.c @@ -2,7 +2,7 @@ utf16_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -103,7 +103,25 @@ utf16be_mbc_enc_len(const UChar* p) static int is_valid_mbc_string(const UChar* s, const UChar* end) { - return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF16_BE, s, end); + while (s < end) { + int len = utf16be_mbc_enc_len(s); + if (len == 4) { + if (s + 2 >= end) + return FALSE; + if (! UTF16_IS_SURROGATE_SECOND(*(s+2))) + return FALSE; + } + else + if (UTF16_IS_SURROGATE_SECOND(*s)) + return FALSE; + + s += len; + } + + if (s != end) + return FALSE; + else + return TRUE; } static int @@ -146,7 +164,15 @@ utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) static int utf16be_code_to_mbclen(OnigCodePoint code) { - return (code > 0xffff ? 4 : 2); + if (code > 0xffff) { + if (code > 0x10ffff) + return ONIGERR_INVALID_CODE_POINT_VALUE; + else + return 4; + } + else { + return 2; + } } static int @@ -243,7 +269,8 @@ utf16be_left_adjust_char_head(const UChar* start, const UChar* s) s--; } - if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) + if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1 && + UTF16_IS_SURROGATE_FIRST(*(s-2))) s -= 2; return (UChar* )s; diff --git a/src/utf16_le.c b/src/utf16_le.c index 4b231c6..cdc74b0 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -2,7 +2,7 @@ utf16_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -95,7 +95,15 @@ static const int EncLen_UTF16[] = { static int utf16le_code_to_mbclen(OnigCodePoint code) { - return (code > 0xffff ? 4 : 2); + if (code > 0xffff) { + if (code > 0x10ffff) + return ONIGERR_INVALID_CODE_POINT_VALUE; + else + return 4; + } + else { + return 2; + } } static int @@ -110,7 +118,16 @@ is_valid_mbc_string(const UChar* p, const UChar* end) const UChar* end1 = end - 1; while (p < end1) { - p += utf16le_mbc_enc_len(p); + int len = utf16le_mbc_enc_len(p); + if (len == 4) { + if (p + 3 < end && ! UTF16_IS_SURROGATE_SECOND(*(p + 3))) + return FALSE; + } + else + if (UTF16_IS_SURROGATE_SECOND(*(p + 1))) + return FALSE; + + p += len; } if (p != end) @@ -252,7 +269,8 @@ utf16le_left_adjust_char_head(const UChar* start, const UChar* s) s--; } - if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) + if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1 && + UTF16_IS_SURROGATE_FIRST(*(s-1))) s -= 2; return (UChar* )s; -- cgit v1.2.3