From fc9ba4264eafbb5a6ec0f3cc4cd2e1964c9b8fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 8 Nov 2020 10:58:30 +0100 Subject: New upstream version 6.9.6 --- src/regcomp.c | 310 ++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 250 insertions(+), 60 deletions(-) (limited to 'src/regcomp.c') diff --git a/src/regcomp.c b/src/regcomp.c index 4d5b78f..dd2b328 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -133,6 +133,7 @@ ops_init(regex_t* reg, int init_alloc_size) size = sizeof(Operation) * init_alloc_size; p = (Operation* )xrealloc(reg->ops, size); CHECK_NULL_RETURN_MEMERR(p); + reg->ops = p; #ifdef USE_DIRECT_THREADED_CODE { enum OpCode* cp; @@ -144,13 +145,12 @@ ops_init(regex_t* reg, int init_alloc_size) #endif } else { - p = (Operation* )0; + reg->ops = (Operation* )0; #ifdef USE_DIRECT_THREADED_CODE reg->ocs = (enum OpCode* )0; #endif } - reg->ops = p; reg->ops_curr = 0; /* !!! not yet done ops_new() */ reg->ops_alloc = init_alloc_size; reg->ops_used = 0; @@ -176,6 +176,7 @@ ops_expand(regex_t* reg, int n) size = sizeof(Operation) * n; p = (Operation* )xrealloc(reg->ops, size); CHECK_NULL_RETURN_MEMERR(p); + reg->ops = p; #ifdef USE_DIRECT_THREADED_CODE size = sizeof(enum OpCode) * n; @@ -184,7 +185,6 @@ ops_expand(regex_t* reg, int n) reg->ocs = cp; #endif - reg->ops = p; reg->ops_alloc = n; if (reg->ops_used == 0) reg->ops_curr = 0; @@ -265,10 +265,12 @@ ops_free(regex_t* reg) case OP_BACKREF1: case OP_BACKREF2: case OP_BACKREF_N: case OP_BACKREF_N_IC: break; case OP_BACKREF_MULTI: case OP_BACKREF_MULTI_IC: + case OP_BACKREF_CHECK: +#ifdef USE_BACKREF_WITH_LEVEL case OP_BACKREF_WITH_LEVEL: case OP_BACKREF_WITH_LEVEL_IC: - case OP_BACKREF_CHECK: case OP_BACKREF_CHECK_WITH_LEVEL: +#endif if (op->backref_general.num != 1) xfree(op->backref_general.ns); break; @@ -631,7 +633,7 @@ mmcl_add(MinMaxCharLen* to, MinMaxCharLen* add) to->min = distance_add(to->min, add->min); to->max = distance_add(to->max, add->max); - to->min_is_sure = add->min_is_sure != 0 && to->min_is_sure != 0; + to->min_is_sure = add->min_is_sure != FALSE && to->min_is_sure != FALSE; } static void @@ -656,8 +658,11 @@ static void mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt) { if (to->min > alt->min) { - to->min = alt->min; - if (alt->min_is_sure != 0) + to->min = alt->min; + to->min_is_sure = alt->min_is_sure; + } + else if (to->min == alt->min) { + if (alt->min_is_sure != FALSE) to->min_is_sure = TRUE; } @@ -840,7 +845,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, en->min_char_len = ci->min; en->max_char_len = ci->max; NODE_STATUS_ADD(node, FIXED_CLEN); - if (ci->min_is_sure != 0) + if (ci->min_is_sure != FALSE) NODE_STATUS_ADD(node, FIXED_CLEN_MIN_SURE); } } @@ -882,15 +887,15 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, } break; - case NODE_ANCHOR: + case NODE_GIMMICK: mmcl_set(ci, 0); - /* can't optimize look-behind if anchor exists. */ - ci->min_is_sure = FALSE; break; - case NODE_GIMMICK: + case NODE_ANCHOR: zero: mmcl_set(ci, 0); + /* can't optimize look-behind if anchor exists. */ + ci->min_is_sure = FALSE; break; case NODE_BACKREF: @@ -1082,6 +1087,9 @@ compile_call(CallNode* node, regex_t* reg, ScanEnv* env) if (r != 0) return r; COP(reg)->call.addr = 0; /* dummy addr. */ +#ifdef ONIG_DEBUG_MATCH_COUNTER + COP(reg)->call.called_mem = node->called_gnum; +#endif offset = COP_CURR_OFFSET_BYTES(reg, call.addr); r = unset_addr_list_add(env->unset_addr_list, offset, NODE_CALL_BODY(node)); @@ -1822,7 +1830,6 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) COP(reg)->memory_end.num = node->m.regnum; if (NODE_IS_CALLED(node)) { - if (r != 0) return r; r = add_op(reg, OP_RETURN); } #else @@ -2764,7 +2771,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) static int make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter) { - int r = 0; + int r; Node* node = *plink; switch (NODE_TYPE(node)) { @@ -2772,17 +2779,17 @@ make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter) case NODE_ALT: do { r = make_named_capture_number_map(&(NODE_CAR(node)), map, counter); - } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + } while (r >= 0 && IS_NOT_NULL(node = NODE_CDR(node))); + if (r < 0) return r; break; case NODE_QUANT: { Node** ptarget = &(NODE_BODY(node)); - Node* old = *ptarget; r = make_named_capture_number_map(ptarget, map, counter); - if (r != 0) return r; - if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) { - r = onig_reduce_nested_quantifier(node); + if (r < 0) return r; + if (r == 1 && NODE_TYPE(*ptarget) == NODE_QUANT) { + return onig_reduce_nested_quantifier(node); } } break; @@ -2796,41 +2803,48 @@ make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter) map[en->m.regnum].new_val = *counter; en->m.regnum = *counter; r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); + if (r < 0) return r; } else { *plink = NODE_BODY(node); NODE_BODY(node) = NULL_NODE; onig_node_free(node); r = make_named_capture_number_map(plink, map, counter); + if (r < 0) return r; + return 1; } } else if (en->type == BAG_IF_ELSE) { r = make_named_capture_number_map(&(NODE_BAG_BODY(en)), map, counter); - if (r != 0) return r; + if (r < 0) return r; if (IS_NOT_NULL(en->te.Then)) { r = make_named_capture_number_map(&(en->te.Then), map, counter); - if (r != 0) return r; + if (r < 0) return r; } if (IS_NOT_NULL(en->te.Else)) { r = make_named_capture_number_map(&(en->te.Else), map, counter); - if (r != 0) return r; + if (r < 0) return r; } } - else + else { r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); + if (r < 0) return r; + } } break; case NODE_ANCHOR: - if (IS_NOT_NULL(NODE_BODY(node))) + if (IS_NOT_NULL(NODE_BODY(node))) { r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); + if (r < 0) return r; + } break; default: break; } - return r; + return 0; } static int @@ -2982,7 +2996,7 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) } counter = 0; r = make_named_capture_number_map(root, map, &counter); - if (r != 0) return r; + if (r < 0) return r; r = renumber_backref_traverse(*root, map); if (r != 0) return r; @@ -3546,7 +3560,9 @@ check_node_in_look_behind(Node* node, int not, int* used) if (r != 0) break; if (en->type == BAG_MEMORY) { - if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node)) *used = TRUE; + if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node) + || NODE_IS_REFERENCED(node)) + *used = TRUE; } else if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { @@ -3978,6 +3994,7 @@ set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) { BagNode* en = BAG_(node); + r = 0; if (en->type == BAG_MEMORY) { if (NODE_IS_BACKREF(node)) { if (IS_NOT_NULL(empty)) @@ -4484,7 +4501,7 @@ remove_from_list(Node* prev, Node* a) } static int -reduce_string_list(Node* node) +reduce_string_list(Node* node, OnigEncoding enc) { int r = 0; @@ -4515,43 +4532,70 @@ reduce_string_list(Node* node) } } else { - prev = NULL_NODE; + if (IS_NOT_NULL(prev)) { +#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE + StrNode* sn = STR_(prev); + if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; +#endif + prev = NULL_NODE; + } + r = reduce_string_list(curr, enc); + if (r != 0) return r; prev_node = node; } node = next_node; } while (r == 0 && IS_NOT_NULL(node)); + +#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE + if (IS_NOT_NULL(prev)) { + StrNode* sn = STR_(prev); + if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + } +#endif } break; case NODE_ALT: do { - r = reduce_string_list(NODE_CAR(node)); + r = reduce_string_list(NODE_CAR(node), enc); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; +#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE + case NODE_STRING: + { + StrNode* sn = STR_(node); + if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + } + break; +#endif + case NODE_ANCHOR: if (IS_NULL(NODE_BODY(node))) break; /* fall */ case NODE_QUANT: - r = reduce_string_list(NODE_BODY(node)); + r = reduce_string_list(NODE_BODY(node), enc); break; case NODE_BAG: { BagNode* en = BAG_(node); - r = reduce_string_list(NODE_BODY(node)); + r = reduce_string_list(NODE_BODY(node), enc); if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = reduce_string_list(en->te.Then); + r = reduce_string_list(en->te.Then, enc); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) { - r = reduce_string_list(en->te.Else); + r = reduce_string_list(en->te.Else, enc); if (r != 0) return r; } } @@ -4723,7 +4767,7 @@ tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; } - if (ci.min == 0 && ci.min_is_sure != 0 && used == FALSE) { + if (ci.min == 0 && ci.min_is_sure != FALSE && used == FALSE) { if (an->type == ANCR_LOOK_BEHIND_NOT) r = onig_node_reset_fail(node); else @@ -4779,18 +4823,23 @@ tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) static int tune_next(Node* node, Node* next_node, regex_t* reg) { + int called; NodeType type; + called = FALSE; + retry: type = NODE_TYPE(node); if (type == NODE_QUANT) { QuantNode* qn = QUANT_(node); if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) { #ifdef USE_QUANT_PEEK_NEXT - Node* n = get_tree_head_literal(next_node, 1, reg); - /* '\0': for UTF-16BE etc... */ - if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') { - qn->next_head_exact = n; + if (called == FALSE) { + Node* n = get_tree_head_literal(next_node, 1, reg); + /* '\0': for UTF-16BE etc... */ + if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') { + qn->next_head_exact = n; + } } #endif /* automatic posseivation a*b ==> (?>a*)b */ @@ -4815,6 +4864,8 @@ tune_next(Node* node, Node* next_node, regex_t* reg) else if (type == NODE_BAG) { BagNode* en = BAG_(node); if (en->type == BAG_MEMORY) { + if (NODE_IS_CALLED(node)) + called = TRUE; node = NODE_BODY(node); goto retry; } @@ -4999,17 +5050,18 @@ unravel_cf_look_behind_add(Node** rlist, Node** rsn, { int r, i, found; - found = 0; + found = FALSE; for (i = 0; i < n; i++) { OnigCaseFoldCodeItem* item = items + i; if (item->byte_len == one_len) { if (item->code_len == 1) { - found = 1; + found = TRUE; + break; } } } - if (found == 0) { + if (found == FALSE) { r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */); } else { @@ -5073,6 +5125,7 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state) one_len = (OnigLen )enclen(enc, p); if (n == 0) { q = p + one_len; + if (q > end) q = end; r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */); if (r != 0) goto err; } @@ -5221,12 +5274,12 @@ quantifiers_memory_node_info(Node* node) __inline #endif static int -tune_call_node_call(CallNode* cn, ScanEnv* env, int state) +check_call_reference(CallNode* cn, ScanEnv* env, int state) { MemEnv* mem_env = SCANENV_MEMENV(env); if (cn->by_number != 0) { - int gnum = cn->group_num; + int gnum = cn->called_gnum; if (env->num_named > 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && @@ -5241,12 +5294,14 @@ tune_call_node_call(CallNode* cn, ScanEnv* env, int state) } set_call_attr: - NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node; + NODE_CALL_BODY(cn) = mem_env[cn->called_gnum].mem_node; if (IS_NULL(NODE_CALL_BODY(cn))) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } + + NODE_STATUS_ADD(NODE_CALL_BODY(cn), REFERENCED); } else { int *refs; @@ -5263,7 +5318,7 @@ tune_call_node_call(CallNode* cn, ScanEnv* env, int state) return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL; } else { - cn->group_num = refs[0]; + cn->called_gnum = refs[0]; goto set_call_attr; } } @@ -5396,7 +5451,7 @@ tune_call(Node* node, ScanEnv* env, int state) CALL_(node)->entry_count--; } - r = tune_call_node_call(CALL_(node), env, state); + r = check_call_reference(CALL_(node), env, state); break; default: @@ -6187,8 +6242,10 @@ concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc) r = 1; /* 1:full */ break; } - for (j = 0; j < len && p < end; j++) + for (j = 0; j < len && p < end; j++) { + /* coverity[overrun-local] */ to->s[i++] = *p++; + } } to->len = i; @@ -6210,8 +6267,10 @@ concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc) for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) { len = enclen(enc, p); if (i + len > OPT_EXACT_MAXLEN) break; - for (j = 0; j < len && p < end; j++) + for (j = 0; j < len && p < end; j++) { + /* coverity[overrun-local] */ to->s[i++] = *p++; + } } to->len = i; @@ -7229,19 +7288,10 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, else reg->ops_used = 0; - reg->string_pool = 0; - reg->string_pool_end = 0; - reg->num_mem = 0; - reg->num_repeat = 0; - reg->num_empty_check = 0; - reg->repeat_range_alloc = 0; - reg->repeat_range = (RepeatRange* )NULL; - reg->empty_status_mem = 0; - r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; - r = reduce_string_list(root); + r = reduce_string_list(root, reg->enc); if (r != 0) goto err; /* mixed use named group and no-named group */ @@ -7653,6 +7703,134 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) return onig_is_code_in_cc_len(len, code, cc); } +typedef struct { + int prec_read; + int look_behind; + int backref_with_level; + int call; +} SlowElementCount; + +static int +node_detect_can_be_slow(Node* node, SlowElementCount* ct) +{ + int r; + + r = 0; + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + r = node_detect_can_be_slow(NODE_CAR(node), ct); + if (r != 0) return r; + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_QUANT: + r = node_detect_can_be_slow(NODE_BODY(node), ct); + break; + + case NODE_ANCHOR: + switch (ANCHOR_(node)->type) { + case ANCR_PREC_READ: + case ANCR_PREC_READ_NOT: + ct->prec_read++; + break; + case ANCR_LOOK_BEHIND: + case ANCR_LOOK_BEHIND_NOT: + ct->look_behind++; + break; + default: + break; + } + + if (ANCHOR_HAS_BODY(ANCHOR_(node))) + r = node_detect_can_be_slow(NODE_BODY(node), ct); + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + r = node_detect_can_be_slow(NODE_BODY(node), ct); + if (r != 0) return r; + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = node_detect_can_be_slow(en->te.Then, ct); + if (r != 0) return r; + } + if (IS_NOT_NULL(en->te.Else)) { + r = node_detect_can_be_slow(en->te.Else, ct); + if (r != 0) return r; + } + } + } + break; + +#ifdef USE_BACKREF_WITH_LEVEL + case NODE_BACKREF: + if (NODE_IS_NEST_LEVEL(node)) + ct->backref_with_level++; + break; +#endif + +#ifdef USE_CALL + case NODE_CALL: + ct->call++; + break; +#endif + + default: + break; + } + + return r; +} + +extern int +onig_detect_can_be_slow_pattern(const UChar* pattern, + const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, + OnigSyntaxType* syntax) +{ + int r; + regex_t* reg; + Node* root; + ScanEnv scan_env; + SlowElementCount count; + + reg = (regex_t* )xmalloc(sizeof(regex_t)); + if (IS_NULL(reg)) return ONIGERR_MEMORY; + + r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); + if (r != 0) { + xfree(reg); + return r; + } + + root = 0; + r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); + if (r == 0) { + count.prec_read = 0; + count.look_behind = 0; + count.backref_with_level = 0; + count.call = 0; + + r = node_detect_can_be_slow(root, &count); + if (r == 0) { + int n = count.prec_read + count.look_behind + + count.backref_with_level + count.call; + r = n; + } + } + + if (IS_NOT_NULL(scan_env.mem_env_dynamic)) + xfree(scan_env.mem_env_dynamic); + + onig_node_free(root); + onig_free(reg); + return r; +} + #ifdef ONIG_DEBUG_PARSE @@ -7734,14 +7912,18 @@ print_indent_tree(FILE* f, Node* node, int indent) break; case NODE_CCLASS: +#define CCLASS_MBUF_MAX_OUTPUT_NUM 10 + fprintf(f, "", node); if (IS_NCCLASS_NOT(CCLASS_(node))) fputs(" not", f); if (CCLASS_(node)->mbuf) { BBuf* bbuf = CCLASS_(node)->mbuf; - for (i = 0; i < bbuf->used; i++) { + fprintf(f, " mbuf(%u) ", bbuf->used); + for (i = 0; i < bbuf->used && i < CCLASS_MBUF_MAX_OUTPUT_NUM; i++) { if (i > 0) fprintf(f, ","); fprintf(f, "%0x", bbuf->p[i]); } + if (i < bbuf->used) fprintf(f, "..."); } break; @@ -7822,6 +8004,11 @@ print_indent_tree(FILE* f, Node* node, int indent) if (i > 0) fputs(", ", f); fprintf(f, "%d", p[i]); } +#ifdef USE_BACKREF_WITH_LEVEL + if (NODE_IS_NEST_LEVEL(node)) { + fprintf(f, ", level: %d", br->nest_level); + } +#endif } break; @@ -7830,6 +8017,7 @@ print_indent_tree(FILE* f, Node* node, int indent) { CallNode* cn = CALL_(node); fprintf(f, "", node); + fprintf(f, " num: %d, name", cn->called_gnum); p_string(f, cn->name_end - cn->name, cn->name); } break; @@ -7881,6 +8069,8 @@ print_indent_tree(FILE* f, Node* node, int indent) fprintf(f, "memory:%d", BAG_(node)->m.regnum); if (NODE_IS_CALLED(node)) fprintf(f, ", called"); + else if (NODE_IS_REFERENCED(node)) + fprintf(f, ", referenced"); if (NODE_IS_FIXED_ADDR(node)) fprintf(f, ", fixed-addr"); break; -- cgit v1.2.3