diff options
Diffstat (limited to 'src/regcomp.c')
-rw-r--r-- | src/regcomp.c | 1955 |
1 files changed, 1207 insertions, 748 deletions
diff --git a/src/regcomp.c b/src/regcomp.c index c2c04a4..69d4b95 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -2,7 +2,7 @@ regcomp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -224,17 +224,17 @@ ops_free(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: if (! is_in_string_pool(reg, op->exact_len_n.s)) xfree(op->exact_len_n.s); break; - case OP_EXACTN: case OP_EXACTMB2N: case OP_EXACTMB3N: case OP_EXACTN_IC: + case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N: case OP_STR_N_IC: if (! is_in_string_pool(reg, op->exact_n.s)) xfree(op->exact_n.s); break; - case OP_EXACT1: case OP_EXACT2: case OP_EXACT3: case OP_EXACT4: - case OP_EXACT5: case OP_EXACTMB2N1: case OP_EXACTMB2N2: - case OP_EXACTMB2N3: case OP_EXACT1_IC: + case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4: + case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2: + case OP_STR_MB2N3: case OP_STR_1_IC: break; case OP_CCLASS_NOT: case OP_CCLASS: @@ -298,17 +298,17 @@ ops_calc_size_of_string_pool(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: total += op->exact_len_n.len * op->exact_len_n.n; break; - case OP_EXACTN: - case OP_EXACTN_IC: + case OP_STR_N: + case OP_STR_N_IC: total += op->exact_n.n; break; - case OP_EXACTMB2N: + case OP_STR_MB2N: total += op->exact_n.n * 2; break; - case OP_EXACTMB3N: + case OP_STR_MB3N: total += op->exact_n.n * 3; break; @@ -349,15 +349,15 @@ ops_make_string_pool(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: len = op->exact_len_n.len * op->exact_len_n.n; xmemcpy(curr, op->exact_len_n.s, len); xfree(op->exact_len_n.s); op->exact_len_n.s = curr; curr += len; break; - case OP_EXACTN: - case OP_EXACTN_IC: + case OP_STR_N: + case OP_STR_N_IC: len = op->exact_n.n; copy: xmemcpy(curr, op->exact_n.s, len); @@ -365,11 +365,11 @@ ops_make_string_pool(regex_t* reg) op->exact_n.s = curr; curr += len; break; - case OP_EXACTMB2N: + case OP_STR_MB2N: len = op->exact_n.n * 2; goto copy; break; - case OP_EXACTMB3N: + case OP_STR_MB3N: len = op->exact_n.n * 3; goto copy; break; @@ -427,7 +427,7 @@ onig_positive_int_multiply(int x, int y) static void -swap_node(Node* a, Node* b) +node_swap(Node* a, Node* b) { Node c; @@ -452,6 +452,81 @@ swap_node(Node* a, Node* b) } } +static int +node_list_len(Node* list) +{ + int len; + + len = 1; + while (IS_NOT_NULL(NODE_CDR(list))) { + list = NODE_CDR(list); + len++; + } + + return len; +} + +static Node* +node_list_add(Node* list, Node* x) +{ + Node *n; + + n = onig_node_new_list(x, NULL); + if (IS_NULL(n)) return NULL_NODE; + + if (IS_NOT_NULL(list)) { + while (IS_NOT_NULL(NODE_CDR(list))) + list = NODE_CDR(list); + + NODE_CDR(list) = n; + } + + return n; +} + +static int +node_str_node_cat(Node* node, Node* add) +{ + int r; + + if (STR_(node)->flag != STR_(add)->flag) + return ONIGERR_TYPE_BUG; + + r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end); + if (r != 0) return r; + + if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) + STR_(node)->case_min_len += STR_(add)->case_min_len; + + return 0; +} + +static int +node_str_cat_case_fold(Node* node, const UChar* s, const UChar* end, int case_min_len) +{ + int r; + + if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) + return ONIGERR_TYPE_BUG; + + r = onig_node_str_cat(node, s, end); + if (r != 0) return r; + + STR_(node)->case_min_len += case_min_len; + return 0; +} + +static void +node_conv_to_str_node(Node* node, int flag) +{ + NODE_SET_TYPE(node, NODE_STRING); + STR_(node)->flag = flag; + STR_(node)->s = STR_(node)->buf; + STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; +} + static OnigLen distance_add(OnigLen d1, OnigLen d2) { @@ -549,81 +624,108 @@ static int compile_length_tree(Node* node, regex_t* reg); static int compile_tree(Node* node, regex_t* reg, ScanEnv* env); -#define IS_NEED_STR_LEN_OP_EXACT(op) \ - ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\ - (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC) +#define IS_NEED_STR_LEN_OP(op) \ + ((op) == OP_STR_N || (op) == OP_STR_MB2N ||\ + (op) == OP_STR_MB3N || (op) == OP_STR_MBN || (op) == OP_STR_N_IC) static int -select_str_opcode(int mb_len, int str_len, int ignore_case) +select_str_opcode(int mb_len, int str_len) { int op; - if (ignore_case) { + switch (mb_len) { + case 1: switch (str_len) { - case 1: op = OP_EXACT1_IC; break; - default: op = OP_EXACTN_IC; break; + case 1: op = OP_STR_1; break; + case 2: op = OP_STR_2; break; + case 3: op = OP_STR_3; break; + case 4: op = OP_STR_4; break; + case 5: op = OP_STR_5; break; + default: op = OP_STR_N; break; } - } - else { - switch (mb_len) { - case 1: - switch (str_len) { - case 1: op = OP_EXACT1; break; - case 2: op = OP_EXACT2; break; - case 3: op = OP_EXACT3; break; - case 4: op = OP_EXACT4; break; - case 5: op = OP_EXACT5; break; - default: op = OP_EXACTN; break; - } - break; + break; - case 2: - switch (str_len) { - case 1: op = OP_EXACTMB2N1; break; - case 2: op = OP_EXACTMB2N2; break; - case 3: op = OP_EXACTMB2N3; break; - default: op = OP_EXACTMB2N; break; - } - break; + case 2: + switch (str_len) { + case 1: op = OP_STR_MB2N1; break; + case 2: op = OP_STR_MB2N2; break; + case 3: op = OP_STR_MB2N3; break; + default: op = OP_STR_MB2N; break; + } + break; - case 3: - op = OP_EXACTMB3N; - break; + case 3: + op = OP_STR_MB3N; + break; - default: - op = OP_EXACTMBN; - break; - } + default: + op = OP_STR_MBN; + break; } + return op; } static int -compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env) +is_strict_real_node(Node* node) +{ + switch (NODE_TYPE(node)) { + case NODE_STRING: + { + StrNode* sn = STR_(node); + return (sn->end != sn->s); + } + break; + + case NODE_CCLASS: + case NODE_CTYPE: + return 1; + break; + + default: + return 0; + break; + } +} + +static int +compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) { int r; - int saved_num_null_check = reg->num_null_check; + int saved_num_empty_check; + int emptiness; + Node* body; - if (empty_info != BODY_IS_NOT_EMPTY) { + body = NODE_BODY((Node* )qn); + emptiness = qn->emptiness; + saved_num_empty_check = reg->num_empty_check; + + if (emptiness != BODY_IS_NOT_EMPTY) { r = add_op(reg, OP_EMPTY_CHECK_START); if (r != 0) return r; - COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */ - reg->num_null_check++; + COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */ + reg->num_empty_check++; } - r = compile_tree(node, reg, env); + r = compile_tree(body, reg, env); if (r != 0) return r; - if (empty_info != BODY_IS_NOT_EMPTY) { - if (empty_info == BODY_IS_EMPTY) + if (emptiness != BODY_IS_NOT_EMPTY) { + if (emptiness == BODY_IS_EMPTY_POSSIBILITY) r = add_op(reg, OP_EMPTY_CHECK_END); - else if (empty_info == BODY_IS_EMPTY_MEM) - r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); - else if (empty_info == BODY_IS_EMPTY_REC) + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) { + if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0) + r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); + else + r = add_op(reg, OP_EMPTY_CHECK_END); + } +#ifdef USE_CALL + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); +#endif if (r != 0) return r; - COP(reg)->empty_check_end.mem = saved_num_null_check; /* NULL CHECK ID */ + COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */ } return r; } @@ -660,14 +762,13 @@ compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env) static int add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len, - regex_t* reg ARG_UNUSED, int ignore_case) + regex_t* reg ARG_UNUSED) { return 1; } static int -add_compile_string(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) +add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg) { int op; int r; @@ -675,14 +776,14 @@ add_compile_string(UChar* s, int mb_len, int str_len, UChar* p; UChar* end; - op = select_str_opcode(mb_len, str_len, ignore_case); + op = select_str_opcode(mb_len, str_len); r = add_op(reg, op); if (r != 0) return r; byte_len = mb_len * str_len; end = s + byte_len; - if (op == OP_EXACTMBN) { + if (op == OP_STR_MBN) { p = onigenc_strdup(reg->enc, s, end); CHECK_NULL_RETURN_MEMERR(p); @@ -690,11 +791,11 @@ add_compile_string(UChar* s, int mb_len, int str_len, COP(reg)->exact_len_n.n = str_len; COP(reg)->exact_len_n.s = p; } - else if (IS_NEED_STR_LEN_OP_EXACT(op)) { + else if (IS_NEED_STR_LEN_OP(op)) { p = onigenc_strdup(reg->enc, s, end); CHECK_NULL_RETURN_MEMERR(p); - if (op == OP_EXACTN_IC) + if (op == OP_STR_N_IC) COP(reg)->exact_n.n = byte_len; else COP(reg)->exact_n.n = str_len; @@ -702,8 +803,8 @@ add_compile_string(UChar* s, int mb_len, int str_len, COP(reg)->exact_n.s = p; } else { + xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s)); xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len); - COP(reg)->exact.s[byte_len] = '\0'; } return 0; @@ -712,7 +813,7 @@ add_compile_string(UChar* s, int mb_len, int str_len, static int compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig; + int rlen, r, len, prev_len, slen; UChar *p, *prev; StrNode* sn; OnigEncoding enc = reg->enc; @@ -721,7 +822,7 @@ compile_length_string_node(Node* node, regex_t* reg) if (sn->end <= sn->s) return 0; - ambig = NODE_STRING_IS_AMBIG(node); + if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) return 1; p = prev = sn->s; prev_len = enclen(enc, p); @@ -735,7 +836,7 @@ compile_length_string_node(Node* node, regex_t* reg) slen++; } else { - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, slen, reg); rlen += r; prev = p; slen = 1; @@ -744,25 +845,59 @@ compile_length_string_node(Node* node, regex_t* reg) p += len; } - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, slen, reg); rlen += r; return rlen; } static int -compile_length_string_raw_node(StrNode* sn, regex_t* reg) +compile_length_string_crude_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s), - reg, 0); + reg); +} + +static int +compile_ambig_string_node(Node* node, regex_t* reg) +{ + int r; + int len; + int byte_len; + UChar* p; + StrNode* sn; + OnigEncoding enc = reg->enc; + + sn = STR_(node); + len = enclen(enc, sn->s); + byte_len = (int )(sn->end - sn->s); + if (len == byte_len) { + r = add_op(reg, OP_STR_1_IC); + if (r != 0) return r; + + xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s)); + xmemcpy(COP(reg)->exact.s, sn->s, (size_t )byte_len); + } + else { + r = add_op(reg, OP_STR_N_IC); + if (r != 0) return r; + + p = onigenc_strdup(enc, sn->s, sn->end); + CHECK_NULL_RETURN_MEMERR(p); + + COP(reg)->exact_n.s = p; + COP(reg)->exact_n.n = byte_len; + } + + return 0; } static int compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig; + int r, len, prev_len, slen; UChar *p, *prev, *end; StrNode* sn; OnigEncoding enc = reg->enc; @@ -772,7 +907,9 @@ compile_string_node(Node* node, regex_t* reg) return 0; end = sn->end; - ambig = NODE_STRING_IS_AMBIG(node); + if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) { + return compile_ambig_string_node(node, reg); + } p = prev = sn->s; prev_len = enclen(enc, p); @@ -785,7 +922,7 @@ compile_string_node(Node* node, regex_t* reg) slen++; } else { - r = add_compile_string(prev, prev_len, slen, reg, ambig); + r = add_compile_string(prev, prev_len, slen, reg); if (r != 0) return r; prev = p; @@ -796,16 +933,16 @@ compile_string_node(Node* node, regex_t* reg) p += len; } - return add_compile_string(prev, prev_len, slen, reg, ambig); + return add_compile_string(prev, prev_len, slen, reg); } static int -compile_string_raw_node(StrNode* sn, regex_t* reg) +compile_string_crude_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; - return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg, 0); + return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg); } static void* @@ -869,15 +1006,27 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) return 0; } +static void +set_addr_in_repeat_range(regex_t* reg) +{ + int i; + + for (i = 0; i < reg->num_repeat; i++) { + RepeatRange* p = reg->repeat_range + i; + int offset = p->u.offset; + p->u.pcode = reg->ops + offset; + } +} + static int -entry_repeat_range(regex_t* reg, int id, int lower, int upper) +entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index) { #define REPEAT_RANGE_ALLOC 4 - OnigRepeatRange* p; + RepeatRange* p; if (reg->repeat_range_alloc == 0) { - p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC); + p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = REPEAT_RANGE_ALLOC; @@ -885,7 +1034,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) else if (reg->repeat_range_alloc <= id) { int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; - p = (OnigRepeatRange* )xrealloc(reg->repeat_range, sizeof(OnigRepeatRange) * n); + p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = n; @@ -894,13 +1043,14 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) p = reg->repeat_range; } - p[id].lower = lower; - p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); + p[id].lower = lower; + p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper); + p[id].u.offset = ops_index; return 0; } static int -compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, +compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, regex_t* reg, ScanEnv* env) { int r; @@ -910,24 +1060,16 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, if (r != 0) return r; COP(reg)->repeat.id = num_repeat; - COP(reg)->repeat.addr = SIZE_INC_OP + target_len + SIZE_OP_REPEAT_INC; + COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC; - r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper); + r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper, + COP_CURR_OFFSET(reg) + OPSIZE_REPEAT); if (r != 0) return r; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - if ( -#ifdef USE_CALL - NODE_IS_IN_MULTI_ENTRY(qn) || -#endif - NODE_IS_IN_REAL_REPEAT(qn)) { - r = add_op(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); - } - else { - r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); - } + r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); if (r != 0) return r; COP(reg)->repeat_inc.id = num_repeat; @@ -937,7 +1079,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, static int is_anychar_infinite_greedy(QuantNode* qn) { - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && + if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) && NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn))) return 1; else @@ -951,8 +1093,8 @@ static int compile_length_quantifier_node(QuantNode* qn, regex_t* reg) { int len, mod_tlen; - int infinite = IS_REPEAT_INFINITE(qn->upper); - enum BodyEmpty empty_info = qn->empty_info; + int infinite = IS_INFINITE_REPEAT(qn->upper); + enum BodyEmptyType emptiness = qn->emptiness; int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg); if (tlen < 0) return tlen; @@ -963,22 +1105,21 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) if (qn->lower <= 1 || int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) { if (IS_NOT_NULL(qn->next_head_exact)) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + return OPSIZE_ANYCHAR_STAR + tlen * qn->lower; } } - if (empty_info == BODY_IS_NOT_EMPTY) - mod_tlen = tlen; - else - mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); + mod_tlen = tlen; + if (emptiness != BODY_IS_NOT_EMPTY) + mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { - len = SIZE_OP_JUMP; + len = OPSIZE_JUMP; } else { len = tlen * qn->lower; @@ -987,36 +1128,36 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) - len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH_OR_JUMP_EXACT1 + mod_tlen + OPSIZE_JUMP; else #endif if (IS_NOT_NULL(qn->next_head_exact)) - len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH_IF_PEEK_NEXT + mod_tlen + OPSIZE_JUMP; else - len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH + mod_tlen + OPSIZE_JUMP; } else - len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH; + len += OPSIZE_JUMP + mod_tlen + OPSIZE_PUSH; } else if (qn->upper == 0) { - if (qn->is_refered != 0) { /* /(?<n>..){0}/ */ - len = SIZE_OP_JUMP + tlen; + if (qn->include_referred != 0) { /* /(?<n>..){0}/ */ + len = OPSIZE_JUMP + tlen; } else len = 0; } else if (!infinite && qn->greedy && (qn->upper == 1 || - int_multiply_cmp(tlen + SIZE_OP_PUSH, qn->upper, + int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { len = tlen * qn->lower; - len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); + len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower); } else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ - len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen; + len = OPSIZE_PUSH + OPSIZE_JUMP + tlen; } else { - len = SIZE_OP_REPEAT_INC + mod_tlen + SIZE_OP_REPEAT; + len = OPSIZE_REPEAT_INC + mod_tlen + OPSIZE_REPEAT; } return len; @@ -1026,8 +1167,8 @@ static int compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) { int i, r, mod_tlen; - int infinite = IS_REPEAT_INFINITE(qn->upper); - enum BodyEmpty empty_info = qn->empty_info; + int infinite = IS_INFINITE_REPEAT(qn->upper); + enum BodyEmptyType emptiness = qn->emptiness; int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg); if (tlen < 0) return tlen; @@ -1055,10 +1196,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } } - if (empty_info == BODY_IS_NOT_EMPTY) - mod_tlen = tlen; - else - mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); + mod_tlen = tlen; + if (emptiness != BODY_IS_NOT_EMPTY) + mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || @@ -1071,16 +1211,16 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) - COP(reg)->jump.addr = SIZE_OP_PUSH_OR_JUMP_EXACT1 + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH_OR_JUMP_EXACT1 + SIZE_INC; else #endif if (IS_NOT_NULL(qn->next_head_exact)) - COP(reg)->jump.addr = SIZE_OP_PUSH_IF_PEEK_NEXT + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH_IF_PEEK_NEXT + SIZE_INC; else - COP(reg)->jump.addr = SIZE_OP_PUSH + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH + SIZE_INC; } else { - COP(reg)->jump.addr = SIZE_OP_JUMP + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_JUMP + SIZE_INC; } } else { @@ -1093,36 +1233,36 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (IS_NOT_NULL(qn->head_exact)) { r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1); if (r != 0) return r; - COP(reg)->push_or_jump_exact1.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1); + addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1); } else #endif if (IS_NOT_NULL(qn->next_head_exact)) { r = add_op(reg, OP_PUSH_IF_PEEK_NEXT); if (r != 0) return r; - COP(reg)->push_if_peek_next.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH_IF_PEEK_NEXT); + addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT); } else { r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH); + addr = -(mod_tlen + (int )OPSIZE_PUSH); } r = add_op(reg, OP_JUMP); @@ -1132,9 +1272,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) else { r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = mod_tlen + SIZE_INC_OP; + COP(reg)->jump.addr = mod_tlen + SIZE_INC; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; r = add_op(reg, OP_PUSH); @@ -1143,10 +1283,10 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } } else if (qn->upper == 0) { - if (qn->is_refered != 0) { /* /(?<n>..){0}/ */ + if (qn->include_referred != 0) { /* /(?<n>..){0}/ */ r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = tlen + SIZE_INC_OP; + COP(reg)->jump.addr = tlen + SIZE_INC; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); } @@ -1157,7 +1297,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } else if (! infinite && qn->greedy && (qn->upper == 1 || - int_multiply_cmp(tlen + SIZE_OP_PUSH, qn->upper, + int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { int n = qn->upper - qn->lower; @@ -1165,7 +1305,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; for (i = 0; i < n; i++) { - int v = onig_positive_int_multiply(n - i, tlen + SIZE_OP_PUSH); + int v = onig_positive_int_multiply(n - i, tlen + OPSIZE_PUSH); if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; r = add_op(reg, OP_PUSH); @@ -1179,16 +1319,16 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP; r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = tlen + SIZE_INC_OP; + COP(reg)->jump.addr = tlen + SIZE_INC; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); } else { - r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg, env); + r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env); } return r; } @@ -1240,40 +1380,40 @@ compile_length_bag_node(BagNode* node, regex_t* reg) #ifdef USE_CALL if (node->m.regnum == 0 && NODE_IS_CALLED(node)) { - len = tlen + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; + len = tlen + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN; return len; } if (NODE_IS_CALLED(node)) { - len = SIZE_OP_MEMORY_START_PUSH + tlen - + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + len = OPSIZE_MEM_START_PUSH + tlen + + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN; + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH); else len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END); } else if (NODE_IS_RECURSION(node)) { - len = SIZE_OP_MEMORY_START_PUSH; - len += tlen + (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC); + len = OPSIZE_MEM_START_PUSH; + len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum) + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_REC); } else #endif { - if (MEM_STATUS_AT0(reg->bt_mem_start, node->m.regnum)) - len = SIZE_OP_MEMORY_START_PUSH; + if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum)) + len = OPSIZE_MEM_START_PUSH; else - len = SIZE_OP_MEMORY_START; + len = OPSIZE_MEM_START; - len += tlen + (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum) - ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END); + len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum) + ? OPSIZE_MEM_END_PUSH : OPSIZE_MEM_END); } break; case BAG_STOP_BACKTRACK: - if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { + if (NODE_IS_STRICT_REAL_REPEAT(node)) { int v; QuantNode* qn; @@ -1283,10 +1423,10 @@ compile_length_bag_node(BagNode* node, regex_t* reg) v = onig_positive_int_multiply(qn->lower, tlen); if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; - len = v + SIZE_OP_PUSH + tlen + SIZE_OP_POP_OUT + SIZE_OP_JUMP; + len = v + OPSIZE_PUSH + tlen + OPSIZE_POP_OUT + OPSIZE_JUMP; } else { - len = SIZE_OP_ATOMIC_START + tlen + SIZE_OP_ATOMIC_END; + len = OPSIZE_ATOMIC_START + tlen + OPSIZE_ATOMIC_END; } break; @@ -1298,8 +1438,8 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len = compile_length_tree(cond, reg); if (len < 0) return len; - len += SIZE_OP_PUSH; - len += SIZE_OP_ATOMIC_START + SIZE_OP_ATOMIC_END; + len += OPSIZE_PUSH; + len += OPSIZE_ATOMIC_START + OPSIZE_ATOMIC_END; if (IS_NOT_NULL(Then)) { tlen = compile_length_tree(Then, reg); @@ -1307,8 +1447,9 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len += tlen; } + len += OPSIZE_JUMP + OPSIZE_ATOMIC_END; + if (IS_NOT_NULL(Else)) { - len += SIZE_OP_JUMP; tlen = compile_length_tree(Else, reg); if (tlen < 0) return tlen; len += tlen; @@ -1331,24 +1472,25 @@ static int compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) { int r; - int len; #ifdef USE_CALL if (NODE_IS_CALLED(node)) { + int len; + r = add_op(reg, OP_CALL); if (r != 0) return r; - node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + SIZE_OP_JUMP; + node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP; NODE_STATUS_ADD(node, ADDR_FIXED); COP(reg)->call.addr = (int )node->m.called_addr; if (node->m.regnum == 0) { len = compile_length_tree(NODE_BAG_BODY(node), reg); - len += SIZE_OP_RETURN; + len += OPSIZE_RETURN; r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = len + SIZE_INC_OP; + COP(reg)->jump.addr = len + SIZE_INC; r = compile_tree(NODE_BAG_BODY(node), reg, env); if (r != 0) return r; @@ -1358,25 +1500,24 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) } else { len = compile_length_tree(NODE_BAG_BODY(node), reg); - len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN); - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN); + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH); else - len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + len += (NODE_IS_RECURSION(node) ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END); r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = len + SIZE_INC_OP; + COP(reg)->jump.addr = len + SIZE_INC; } } #endif - if (MEM_STATUS_AT0(reg->bt_mem_start, node->m.regnum)) - r = add_op(reg, OP_MEMORY_START_PUSH); + if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum)) + r = add_op(reg, OP_MEM_START_PUSH); else - r = add_op(reg, OP_MEMORY_START); + r = add_op(reg, OP_MEM_START); if (r != 0) return r; COP(reg)->memory_start.num = node->m.regnum; @@ -1384,11 +1525,11 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) if (r != 0) return r; #ifdef USE_CALL - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) r = add_op(reg, (NODE_IS_RECURSION(node) - ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH)); + ? OP_MEM_END_PUSH_REC : OP_MEM_END_PUSH)); else - r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEMORY_END_REC : OP_MEMORY_END)); + r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEM_END_REC : OP_MEM_END)); if (r != 0) return r; COP(reg)->memory_end.num = node->m.regnum; @@ -1397,10 +1538,10 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_RETURN); } #else - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) - r = add_op(reg, OP_MEMORY_END_PUSH); + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) + r = add_op(reg, OP_MEM_END_PUSH); else - r = add_op(reg, OP_MEMORY_END); + r = add_op(reg, OP_MEM_END); if (r != 0) return r; COP(reg)->memory_end.num = node->m.regnum; #endif @@ -1423,7 +1564,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) break; case BAG_STOP_BACKTRACK: - if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { + if (NODE_IS_STRICT_REAL_REPEAT(node)) { QuantNode* qn = QUANT_(NODE_BAG_BODY(node)); r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env); if (r != 0) return r; @@ -1433,7 +1574,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + len + SIZE_OP_POP_OUT + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP_OUT + OPSIZE_JUMP; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); if (r != 0) return r; @@ -1442,7 +1583,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP_OUT); + COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP_OUT); } else { r = add_op(reg, OP_ATOMIC_START); @@ -1455,7 +1596,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) case BAG_IF_ELSE: { - int cond_len, then_len, jump_len; + int cond_len, then_len, else_len, jump_len; Node* cond = NODE_BAG_BODY(node); Node* Then = node->te.Then; Node* Else = node->te.Else; @@ -1472,12 +1613,11 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) else then_len = 0; - jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END; - if (IS_NOT_NULL(Else)) jump_len += SIZE_OP_JUMP; + jump_len = cond_len + then_len + OPSIZE_ATOMIC_END + OPSIZE_JUMP; r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + jump_len; + COP(reg)->push.addr = SIZE_INC + jump_len; r = compile_tree(cond, reg, env); if (r != 0) return r; @@ -1490,11 +1630,20 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) } if (IS_NOT_NULL(Else)) { - int else_len = compile_length_tree(Else, reg); - r = add_op(reg, OP_JUMP); - if (r != 0) return r; - COP(reg)->jump.addr = else_len + SIZE_INC_OP; + else_len = compile_length_tree(Else, reg); + if (else_len < 0) return else_len; + } + else + else_len = 0; + + r = add_op(reg, OP_JUMP); + if (r != 0) return r; + COP(reg)->jump.addr = OPSIZE_ATOMIC_END + else_len + SIZE_INC; + r = add_op(reg, OP_ATOMIC_END); + if (r != 0) return r; + + if (IS_NOT_NULL(Else)) { r = compile_tree(Else, reg, env); } } @@ -1517,16 +1666,16 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) switch (node->type) { case ANCR_PREC_READ: - len = SIZE_OP_PREC_READ_START + tlen + SIZE_OP_PREC_READ_END; + len = OPSIZE_PREC_READ_START + tlen + OPSIZE_PREC_READ_END; break; case ANCR_PREC_READ_NOT: - len = SIZE_OP_PREC_READ_NOT_START + tlen + SIZE_OP_PREC_READ_NOT_END; + len = OPSIZE_PREC_READ_NOT_START + tlen + OPSIZE_PREC_READ_NOT_END; break; case ANCR_LOOK_BEHIND: - len = SIZE_OP_LOOK_BEHIND + tlen; + len = OPSIZE_LOOK_BEHIND + tlen; break; case ANCR_LOOK_BEHIND_NOT: - len = SIZE_OP_LOOK_BEHIND_NOT_START + tlen + SIZE_OP_LOOK_BEHIND_NOT_END; + len = OPSIZE_LOOK_BEHIND_NOT_START + tlen + OPSIZE_LOOK_BEHIND_NOT_END; break; case ANCR_WORD_BOUNDARY: @@ -1535,7 +1684,7 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) case ANCR_WORD_BEGIN: case ANCR_WORD_END: #endif - len = SIZE_OP_WORD_BOUNDARY; + len = OPSIZE_WORD_BOUNDARY; break; case ANCR_TEXT_SEGMENT_BOUNDARY: @@ -1619,7 +1768,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_PREC_READ_NOT_START); if (r != 0) return r; - COP(reg)->prec_read_not_start.addr = SIZE_INC_OP + len + SIZE_OP_PREC_READ_NOT_END; + COP(reg)->prec_read_not_start.addr = SIZE_INC + len + OPSIZE_PREC_READ_NOT_END; r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); if (r != 0) return r; r = add_op(reg, OP_PREC_READ_NOT_END); @@ -1649,7 +1798,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); r = add_op(reg, OP_LOOK_BEHIND_NOT_START); if (r != 0) return r; - COP(reg)->look_behind_not_start.addr = SIZE_INC_OP + len + SIZE_OP_LOOK_BEHIND_NOT_END; + COP(reg)->look_behind_not_start.addr = SIZE_INC + len + OPSIZE_LOOK_BEHIND_NOT_END; if (node->char_len < 0) { r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n); @@ -1735,25 +1884,25 @@ compile_length_gimmick_node(GimmickNode* node, regex_t* reg) switch (node->type) { case GIMMICK_FAIL: - len = SIZE_OP_FAIL; + len = OPSIZE_FAIL; break; case GIMMICK_SAVE: - len = SIZE_OP_PUSH_SAVE_VAL; + len = OPSIZE_PUSH_SAVE_VAL; break; case GIMMICK_UPDATE_VAR: - len = SIZE_OP_UPDATE_VAR; + len = OPSIZE_UPDATE_VAR; break; #ifdef USE_CALLOUT case GIMMICK_CALLOUT: switch (node->detail_type) { case ONIG_CALLOUT_OF_CONTENTS: - len = SIZE_OP_CALLOUT_CONTENTS; + len = OPSIZE_CALLOUT_CONTENTS; break; case ONIG_CALLOUT_OF_NAME: - len = SIZE_OP_CALLOUT_NAME; + len = OPSIZE_CALLOUT_NAME; break; default: @@ -1792,13 +1941,13 @@ compile_length_tree(Node* node, regex_t* reg) r += compile_length_tree(NODE_CAR(node), reg); n++; } while (IS_NOT_NULL(node = NODE_CDR(node))); - r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1); + r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1); } break; case NODE_STRING: - if (NODE_STRING_IS_RAW(node)) - r = compile_length_string_raw_node(STR_(node), reg); + if (NODE_STRING_IS_CRUDE(node)) + r = compile_length_string_crude_node(STR_(node), reg); else r = compile_length_string_node(node, reg); break; @@ -1812,12 +1961,12 @@ compile_length_tree(Node* node, regex_t* reg) break; case NODE_BACKREF: - r = SIZE_OP_BACKREF; + r = OPSIZE_BACKREF; break; #ifdef USE_CALL case NODE_CALL: - r = SIZE_OP_CALL; + r = OPSIZE_CALL; break; #endif @@ -1864,7 +2013,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) do { len += compile_length_tree(NODE_CAR(x), reg); if (IS_NOT_NULL(NODE_CDR(x))) { - len += SIZE_OP_PUSH + SIZE_OP_JUMP; + len += OPSIZE_PUSH + OPSIZE_JUMP; } } while (IS_NOT_NULL(x = NODE_CDR(x))); pos = COP_CURR_OFFSET(reg) + 1 + len; /* goal position */ @@ -1875,7 +2024,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) enum OpCode push = NODE_IS_SUPER(node) ? OP_PUSH_SUPER : OP_PUSH; r = add_op(reg, push); if (r != 0) break; - COP(reg)->push.addr = SIZE_INC_OP + len + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + len + OPSIZE_JUMP; } r = compile_tree(NODE_CAR(node), reg, env); if (r != 0) break; @@ -1890,8 +2039,8 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) break; case NODE_STRING: - if (NODE_STRING_IS_RAW(node)) - r = compile_string_raw_node(STR_(node), reg); + if (NODE_STRING_IS_CRUDE(node)) + r = compile_string_crude_node(STR_(node), reg); else r = compile_string_node(node, reg); break; @@ -2061,8 +2210,9 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) Node** ptarget = &(NODE_BODY(node)); Node* old = *ptarget; r = noname_disable_map(ptarget, map, counter); + if (r != 0) return r; if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) { - onig_reduce_nested_quantifier(node, *ptarget); + r = onig_reduce_nested_quantifier(node); } } break; @@ -2274,11 +2424,11 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) } } - loc = env->capture_history; - MEM_STATUS_CLEAR(env->capture_history); + loc = env->cap_history; + MEM_STATUS_CLEAR(env->cap_history); for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { if (MEM_STATUS_AT(loc, i)) { - MEM_STATUS_ON_SIMPLE(env->capture_history, map[i].new_val); + MEM_STATUS_ON_SIMPLE(env->cap_history, map[i].new_val); } } @@ -2654,7 +2804,7 @@ is_exclusive(Node* x, Node* y, regex_t* reg) len = NODE_STRING_LEN(x); if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y); - if (NODE_STRING_IS_AMBIG(x) || NODE_STRING_IS_AMBIG(y)) { + if (NODE_STRING_IS_CASE_FOLD_MATCH(x) || NODE_STRING_IS_CASE_FOLD_MATCH(y)) { /* tiny version */ return 0; } @@ -2714,7 +2864,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) break; if (exact == 0 || - ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_RAW(node)) { + ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_CRUDE(node)) { n = node; } } @@ -2842,9 +2992,9 @@ tree_min_len(Node* node, ScanEnv* env) if (NODE_IS_RECURSION(node)) break; backs = BACKREFS_P(br); - len = tree_min_len(mem_env[backs[0]].node, env); + len = tree_min_len(mem_env[backs[0]].mem_node, env); for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].node, env); + tmin = tree_min_len(mem_env[backs[i]].mem_node, env); if (len > tmin) len = tmin; } } @@ -3013,7 +3163,7 @@ tree_max_len(Node* node, ScanEnv* env) } backs = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { - tmax = tree_max_len(mem_env[backs[i]].node, env); + tmax = tree_max_len(mem_env[backs[i]].mem_node, env); if (len < tmax) len = tmax; } } @@ -3035,7 +3185,7 @@ tree_max_len(Node* node, ScanEnv* env) if (qn->upper != 0) { len = tree_max_len(NODE_BODY(node), env); if (len != 0) { - if (! IS_REPEAT_INFINITE(qn->upper)) + if (! IS_INFINITE_REPEAT(qn->upper)) len = distance_multiply(len, qn->upper); else len = INFINITE_LEN; @@ -3150,7 +3300,7 @@ check_backrefs(Node* node, ScanEnv* env) if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; - NODE_STATUS_ADD(mem_env[backs[i]].node, BACKREF); + NODE_STATUS_ADD(mem_env[backs[i]].mem_node, BACKREF); } r = 0; } @@ -3164,6 +3314,204 @@ check_backrefs(Node* node, ScanEnv* env) return r; } +static int +set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) +{ + int r; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + r = set_empty_repeat_node_trav(NODE_CAR(node), empty, env); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + { + AnchorNode* an = ANCHOR_(node); + + if (! ANCHOR_HAS_BODY(an)) { + r = 0; + break; + } + + switch (an->type) { + case ANCR_PREC_READ: + case ANCR_LOOK_BEHIND: + empty = NULL_NODE; + break; + default: + break; + } + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + } + break; + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + + if (qn->emptiness != BODY_IS_NOT_EMPTY) empty = node; + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + } + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) { + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + if (r != 0) return r; + } + { + BagNode* en = BAG_(node); + + if (en->type == BAG_MEMORY) { + if (NODE_IS_BACKREF(node)) { + if (IS_NOT_NULL(empty)) + SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty; + } + } + else if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = set_empty_repeat_node_trav(en->te.Then, empty, env); + if (r != 0) return r; + } + if (IS_NOT_NULL(en->te.Else)) { + r = set_empty_repeat_node_trav(en->te.Else, empty, env); + } + } + } + break; + + default: + r = 0; + break; + } + + return r; +} + +static int +is_ancestor_node(Node* node, Node* me) +{ + Node* parent; + + while ((parent = NODE_PARENT(me)) != NULL_NODE) { + if (parent == node) return 1; + me = parent; + } + return 0; +} + +static void +set_empty_status_check_trav(Node* node, ScanEnv* env) +{ + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + set_empty_status_check_trav(NODE_CAR(node), env); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + { + AnchorNode* an = ANCHOR_(node); + + if (! ANCHOR_HAS_BODY(an)) break; + set_empty_status_check_trav(NODE_BODY(node), env); + } + break; + + case NODE_QUANT: + set_empty_status_check_trav(NODE_BODY(node), env); + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) + set_empty_status_check_trav(NODE_BODY(node), env); + { + BagNode* en = BAG_(node); + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + set_empty_status_check_trav(en->te.Then, env); + } + if (IS_NOT_NULL(en->te.Else)) { + set_empty_status_check_trav(en->te.Else, env); + } + } + } + break; + + case NODE_BACKREF: + { + int i; + int* backs; + MemEnv* mem_env = SCANENV_MEMENV(env); + BackRefNode* br = BACKREF_(node); + backs = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + Node* ernode = mem_env[backs[i]].empty_repeat_node; + if (IS_NOT_NULL(ernode)) { + if (! is_ancestor_node(ernode, node)) { + MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]); + NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK); + NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK); + } + } + } + } + break; + + default: + break; + } +} + +static void +set_parent_node_trav(Node* node, Node* parent) +{ + NODE_PARENT(node) = parent; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + set_parent_node_trav(NODE_CAR(node), node); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + if (! ANCHOR_HAS_BODY(ANCHOR_(node))) break; + set_parent_node_trav(NODE_BODY(node), node); + break; + + case NODE_QUANT: + set_parent_node_trav(NODE_BODY(node), node); + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) + set_parent_node_trav(NODE_BODY(node), node); + { + BagNode* en = BAG_(node); + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) + set_parent_node_trav(en->te.Then, node); + if (IS_NOT_NULL(en->te.Else)) { + set_parent_node_trav(en->te.Else, node); + } + } + } + break; + + default: + break; + } +} + #ifdef USE_CALL @@ -3269,6 +3617,9 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) if ((eret & RECURSION_MUST) == 0) r &= ~RECURSION_MUST; } + else { + r &= ~RECURSION_MUST; + } } else { r = infinite_recursive_call_check(NODE_BODY(node), env, head); @@ -3443,7 +3794,7 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) r = recursive_call_check_trav(NODE_BODY(node), env, state); if (QUANT_(node)->upper == 0) { if (r == FOUND_CALLED_NODE) - QUANT_(node)->is_refered = 1; + QUANT_(node)->include_referred = 1; } break; @@ -3466,8 +3817,10 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) if (! NODE_IS_RECURSION(node)) { NODE_STATUS_ADD(node, MARK1); r = recursive_call_check(NODE_BODY(node)); - if (r != 0) + if (r != 0) { NODE_STATUS_ADD(node, RECURSION); + MEM_STATUS_ON(env->backtrack_mem, en->m.regnum); + } NODE_STATUS_REMOVE(node, MARK1); } @@ -3508,6 +3861,96 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) #endif +static void +remove_from_list(Node* prev, Node* a) +{ + if (NODE_CDR(prev) != a) return ; + + NODE_CDR(prev) = NODE_CDR(a); + NODE_CDR(a) = NULL_NODE; +} + +static int +reduce_string_list(Node* node) +{ + int r = 0; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + { + Node* prev; + Node* curr; + Node* prev_node; + Node* next_node; + + prev = NULL_NODE; + do { + next_node = NODE_CDR(node); + curr = NODE_CAR(node); + if (NODE_TYPE(curr) == NODE_STRING) { + if (IS_NULL(prev) || STR_(curr)->flag != STR_(prev)->flag) { + prev = curr; + prev_node = node; + } + else { + r = node_str_node_cat(prev, curr); + if (r != 0) return r; + remove_from_list(prev_node, node); + onig_node_free(node); + } + } + else { + prev = NULL_NODE; + prev_node = node; + } + + node = next_node; + } while (r == 0 && IS_NOT_NULL(node)); + } + break; + + case NODE_ALT: + do { + r = reduce_string_list(NODE_CAR(node)); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + if (IS_NULL(NODE_BODY(node))) + break; + /* fall */ + case NODE_QUANT: + r = reduce_string_list(NODE_BODY(node)); + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + r = reduce_string_list(NODE_BODY(node)); + if (r != 0) return r; + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = reduce_string_list(en->te.Then); + if (r != 0) return r; + } + if (IS_NOT_NULL(en->te.Else)) { + r = reduce_string_list(en->te.Else); + if (r != 0) return r; + } + } + } + break; + + default: + break; + } + + return r; +} + + #define IN_ALT (1<<0) #define IN_NOT (1<<1) #define IN_REAL_REPEAT (1<<2) @@ -3530,7 +3973,7 @@ divide_look_behind_alternatives(Node* node) head = NODE_ANCHOR_BODY(an); np = NODE_CAR(head); - swap_node(node, head); + node_swap(node, head); NODE_CAR(node) = head; NODE_BODY(head) = np; @@ -3552,7 +3995,7 @@ divide_look_behind_alternatives(Node* node) } static int -setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) +tune_look_behind(Node* node, regex_t* reg, ScanEnv* env) { int r, len; AnchorNode* an = ANCHOR_(node); @@ -3573,7 +4016,7 @@ setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) } static int -next_setup(Node* node, Node* next_node, regex_t* reg) +tune_next(Node* node, Node* next_node, regex_t* reg) { NodeType type; @@ -3581,7 +4024,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) type = NODE_TYPE(node); if (type == NODE_QUANT) { QuantNode* qn = QUANT_(node); - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { + if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) { #ifdef USE_QUANT_PEEK_NEXT Node* n = get_head_value_node(next_node, 1, reg); /* '\0': for UTF-16BE etc... */ @@ -3591,7 +4034,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) #endif /* automatic posseivation a*b ==> (?>a*)b */ if (qn->lower <= 1) { - if (NODE_IS_SIMPLE_TYPE(NODE_BODY(node))) { + if (is_strict_real_node(NODE_BODY(node))) { Node *x, *y; x = get_head_value_node(NODE_BODY(node), 0, reg); if (IS_NOT_NULL(x)) { @@ -3599,8 +4042,8 @@ next_setup(Node* node, Node* next_node, regex_t* reg) if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) { Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK); CHECK_NULL_RETURN_MEMERR(en); - NODE_STATUS_ADD(en, STOP_BT_SIMPLE_REPEAT); - swap_node(node, en); + NODE_STATUS_ADD(en, STRICT_REAL_REPEAT); + node_swap(node, en); NODE_BODY(node) = en; } } @@ -3620,23 +4063,57 @@ next_setup(Node* node, Node* next_node, regex_t* reg) static int -update_string_node_case_fold(regex_t* reg, Node *node) +is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[]) { - UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + int i; + + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->code_len != 1) return 0; + } + + return 1; +} + +static int +get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], int* rmin, int* rmax) +{ + int i, len, minlen, maxlen; + + minlen = INT_MAX; + maxlen = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + + len = item->byte_len; + if (len < minlen) minlen = len; + if (len > maxlen) maxlen = len; + } + + *rmin = minlen; + *rmax = maxlen; + return 0; +} + +static int +conv_string_case_fold(OnigEncoding enc, OnigCaseFoldType case_fold_flag, + UChar* s, UChar* end, UChar** rs, UChar** rend, int* rcase_min_len) +{ + UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar *sbuf, *ebuf, *sp; - int r, i, len, sbuf_size; - StrNode* sn = STR_(node); + int i, n, len, sbuf_size; - end = sn->end; - sbuf_size = (int )(end - sn->s) * 2; + *rs = NULL; + sbuf_size = (int )(end - s) * 2; sbuf = (UChar* )xmalloc(sbuf_size); CHECK_NULL_RETURN_MEMERR(sbuf); ebuf = sbuf + sbuf_size; + n = 0; sp = sbuf; - p = sn->s; + p = s; while (p < end) { - len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf); + len = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, buf); for (i = 0; i < len; i++) { if (sp >= ebuf) { sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2); @@ -3648,364 +4125,310 @@ update_string_node_case_fold(regex_t* reg, Node *node) *sp++ = buf[i]; } + n++; } - r = onig_node_str_set(node, sbuf, sp); - if (r != 0) { - xfree(sbuf); - return r; - } - - xfree(sbuf); + *rs = sbuf; + *rend = sp; + *rcase_min_len = n; return 0; } static int -expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end, regex_t* reg) +make_code_list_to_string(Node** rnode, OnigEncoding enc, + int n, OnigCodePoint codes[]) { - int r; - Node *node; + int r, i, len; + Node* node; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - node = onig_node_new_str(s, end); - if (IS_NULL(node)) return ONIGERR_MEMORY; + *rnode = NULL_NODE; + node = onig_node_new_str(NULL, NULL); + CHECK_NULL_RETURN_MEMERR(node); - r = update_string_node_case_fold(reg, node); - if (r != 0) { - onig_node_free(node); - return r; + for (i = 0; i < n; i++) { + len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf); + if (len < 0) { + r = len; + goto err; + } + + r = onig_node_str_cat(node, buf, buf + len); + if (r != 0) goto err; } - NODE_STRING_SET_AMBIG(node); - NODE_STRING_SET_DONT_GET_OPT_INFO(node); *rnode = node; return 0; + + err: + onig_node_free(node); + return r; } static int -expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], UChar *p, - int slen, UChar *end, regex_t* reg, Node **rnode) +unravel_cf_node_add(Node** rlist, Node* add) { - int r, i, j; - int len; - int varlen; - Node *anode, *var_anode, *snode, *xnode, *an; - UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + Node *list; - *rnode = var_anode = NULL_NODE; - - varlen = 0; - for (i = 0; i < item_num; i++) { - if (items[i].byte_len != slen) { - varlen = 1; - break; - } + list = *rlist; + if (IS_NULL(list)) { + list = onig_node_new_list(add, NULL); + CHECK_NULL_RETURN_MEMERR(list); + *rlist = list; + } + else { + Node* r = node_list_add(list, add); + CHECK_NULL_RETURN_MEMERR(r); } - if (varlen != 0) { - *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(var_anode)) return ONIGERR_MEMORY; + return 0; +} + +static int +unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, + unsigned int flag, int case_min_len) +{ + int r; + Node *sn, *list; - xnode = onig_node_new_list(NULL, NULL); - if (IS_NULL(xnode)) goto mem_err; - NODE_CAR(var_anode) = xnode; + list = *rlist; + sn = *rsn; - anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(anode)) goto mem_err; - NODE_CAR(xnode) = anode; + if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) { + if (NODE_STRING_IS_CASE_FOLD_MATCH(sn)) + r = node_str_cat_case_fold(sn, s, end, case_min_len); + else + r = onig_node_str_cat(sn, s, end); } else { - *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(anode)) return ONIGERR_MEMORY; + sn = onig_node_new_str(s, end); + CHECK_NULL_RETURN_MEMERR(sn); + + STR_(sn)->flag = flag; + STR_(sn)->case_min_len = case_min_len; + r = unravel_cf_node_add(&list, sn); } - snode = onig_node_new_str(p, p + slen); - if (IS_NULL(snode)) goto mem_err; + if (r == 0) { + *rlist = list; + *rsn = sn; + } + return r; +} - NODE_CAR(anode) = snode; +static int +unravel_cf_string_fold_add(Node** rlist, Node** rsn, OnigEncoding enc, + OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) +{ + int r; + int case_min_len; + UChar *rs, *rend; - for (i = 0; i < item_num; i++) { - snode = onig_node_new_str(NULL, NULL); - if (IS_NULL(snode)) goto mem_err; + r = conv_string_case_fold(enc, case_fold_flag, s, end, + &rs, &rend, &case_min_len); + if (r != 0) return r; - for (j = 0; j < items[i].code_len; j++) { - len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf); - if (len < 0) { - r = len; - goto mem_err2; - } + r = unravel_cf_string_add(rlist, rsn, rs, rend, + NODE_STRING_CASE_FOLD_MATCH, case_min_len); + xfree(rs); - r = onig_node_str_cat(snode, buf, buf + len); - if (r != 0) goto mem_err2; - } + return r; +} - an = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(an)) { - goto mem_err2; - } +static int +unravel_cf_string_alt_or_cc_add(Node** rlist, int n, + OnigCaseFoldCodeItem items[], int byte_len, OnigEncoding enc, + OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) +{ + int r, i; + Node* node; - if (items[i].byte_len != slen && IS_NOT_NULL(var_anode)) { - Node *rem; - UChar *q = p + items[i].byte_len; + if (is_all_code_len_1_items(n, items)) { + OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */ - if (q < end) { - r = expand_case_fold_make_rem_string(&rem, q, end, reg); - if (r != 0) { - onig_node_free(an); - goto mem_err2; - } + codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end); + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + codes[i+1] = item->code[0]; + } + r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes); + if (r != 0) return r; + } + else { + Node *snode, *alt, *curr; - xnode = onig_node_list_add(NULL_NODE, snode); - if (IS_NULL(xnode)) { - onig_node_free(an); - onig_node_free(rem); - goto mem_err2; - } - if (IS_NULL(onig_node_list_add(xnode, rem))) { - onig_node_free(an); - onig_node_free(xnode); - onig_node_free(rem); - goto mem_err; - } + snode = onig_node_new_str(s, end); + CHECK_NULL_RETURN_MEMERR(snode); + node = curr = onig_node_new_alt(snode, NULL_NODE); + if (IS_NULL(curr)) { + onig_node_free(snode); + return ONIGERR_MEMORY; + } - NODE_CAR(an) = xnode; + r = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + r = make_code_list_to_string(&snode, enc, item->code_len, item->code); + if (r != 0) { + onig_node_free(node); + return r; } - else { - NODE_CAR(an) = snode; + + alt = onig_node_new_alt(snode, NULL_NODE); + if (IS_NULL(alt)) { + onig_node_free(snode); + onig_node_free(node); + return ONIGERR_MEMORY; } - NODE_CDR(var_anode) = an; - var_anode = an; - } - else { - NODE_CAR(an) = snode; - NODE_CDR(anode) = an; - anode = an; + NODE_CDR(curr) = alt; + curr = alt; } } - return varlen; - - mem_err2: - onig_node_free(snode); - - mem_err: - onig_node_free(*rnode); - - return ONIGERR_MEMORY; + r = unravel_cf_node_add(rlist, node); + if (r != 0) onig_node_free(node); + return r; } static int -is_good_case_fold_items_for_search(OnigEncoding enc, int slen, - int n, OnigCaseFoldCodeItem items[]) +unravel_cf_look_behind_add(Node** rlist, Node** rsn, + int n, OnigCaseFoldCodeItem items[], OnigEncoding enc, + UChar* s, int one_len) { - int i, len; - UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + int r, i, found; + found = 0; for (i = 0; i < n; i++) { OnigCaseFoldCodeItem* item = items + i; + if (item->byte_len == one_len) { + if (item->code_len == 1) { + found = 1; + } + } + } - if (item->code_len != 1) return 0; - if (item->byte_len != slen) return 0; - len = ONIGENC_CODE_TO_MBC(enc, item->code[0], buf); - if (len != slen) return 0; + if (found == 0) { + r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */, 0); } + else { + Node* node; + OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */ - return 1; -} + found = 0; + codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len); + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->byte_len == one_len) { + if (item->code_len == 1) { + codes[found++] = item->code[0]; + } + } + } + r = onig_new_cclass_with_code_list(&node, enc, found, codes); + if (r != 0) return r; + + r = unravel_cf_node_add(rlist, node); + if (r != 0) onig_node_free(node); + + *rsn = NULL_NODE; + } -#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8 + return r; +} static int -expand_case_fold_string(Node* node, regex_t* reg, int state) -{ - int r, n, len, alt_num; - int fold_len; - int prev_is_ambig, prev_is_good, is_good, is_in_look_behind; - UChar *start, *end, *p; - UChar* foldp; - Node *top_root, *root, *snode, *prev_node; +unravel_case_fold_string(Node* node, regex_t* reg, int state) +{ + int r, n, one_len, min_len, max_len, in_look_behind; + UChar *start, *end, *p, *q; + StrNode* snode; + Node *sn, *list; + OnigEncoding enc; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - StrNode* sn; - if (NODE_STRING_IS_AMBIG(node)) return 0; + if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0; - sn = STR_(node); + snode = STR_(node); - start = sn->s; - end = sn->end; + start = snode->s; + end = snode->end; if (start >= end) return 0; - is_in_look_behind = (state & IN_LOOK_BEHIND) != 0; + in_look_behind = (state & IN_LOOK_BEHIND) != 0; + enc = reg->enc; - r = 0; - top_root = root = prev_node = snode = NULL_NODE; - alt_num = 1; + list = sn = NULL_NODE; p = start; while (p < end) { - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag, - p, end, items); + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, + items); if (n < 0) { r = n; goto err; } - len = enclen(reg->enc, p); - is_good = is_good_case_fold_items_for_search(reg->enc, len, n, items); - - if (is_in_look_behind || - (IS_NOT_NULL(snode) || - (is_good - /* expand single char case: ex. /(?i:a)/ */ - && !(p == start && p + len >= end)))) { - if (IS_NULL(snode)) { - if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - prev_node = snode = onig_node_new_str(NULL, NULL); - if (IS_NULL(snode)) goto mem_err; - if (IS_NOT_NULL(root)) { - if (IS_NULL(onig_node_list_add(root, snode))) { - onig_node_free(snode); - goto mem_err; - } - } - - prev_is_ambig = -1; /* -1: new */ - prev_is_good = 0; /* escape compiler warning */ - } - else { - prev_is_ambig = NODE_STRING_IS_AMBIG(snode); - prev_is_good = NODE_STRING_IS_GOOD_AMBIG(snode); - } - - if (n != 0) { - foldp = p; - fold_len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, - &foldp, end, buf); - foldp = buf; - } - else { - foldp = p; fold_len = len; - } - - if ((prev_is_ambig == 0 && n != 0) || - (prev_is_ambig > 0 && (n == 0 || prev_is_good != is_good))) { - if (IS_NULL(root) /* && IS_NOT_NULL(prev_node) */) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - prev_node = snode = onig_node_new_str(foldp, foldp + fold_len); - if (IS_NULL(snode)) goto mem_err; - if (IS_NULL(onig_node_list_add(root, snode))) { - onig_node_free(snode); - goto mem_err; - } - } - else { - r = onig_node_str_cat(snode, foldp, foldp + fold_len); - if (r != 0) goto err; - } - - if (n != 0) NODE_STRING_SET_AMBIG(snode); - if (is_good != 0) NODE_STRING_SET_GOOD_AMBIG(snode); + one_len = enclen(enc, p); + if (n == 0) { + q = p + one_len; + r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */, 0); + if (r != 0) goto err; } else { - alt_num *= (n + 1); - if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break; - - if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } + if (in_look_behind != 0) { + q = p + one_len; + r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len); + if (r != 0) goto err; } - - r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node); - if (r < 0) goto mem_err; - if (r == 1) { - if (IS_NULL(root)) { - top_root = prev_node; + else { + get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len); + q = p + max_len; + if (one_len == max_len && min_len == max_len) { + r = unravel_cf_string_alt_or_cc_add(&list, n, items, max_len, enc, + reg->case_fold_flag, p, q); + if (r != 0) goto err; + sn = NULL_NODE; } else { - if (IS_NULL(onig_node_list_add(root, prev_node))) { - onig_node_free(prev_node); - goto mem_err; - } - } - - root = NODE_CAR(prev_node); - } - else { /* r == 0 */ - if (IS_NOT_NULL(root)) { - if (IS_NULL(onig_node_list_add(root, prev_node))) { - onig_node_free(prev_node); - goto mem_err; - } + r = unravel_cf_string_fold_add(&list, &sn, enc, reg->case_fold_flag, + p, q); + if (r != 0) goto err; } } - - snode = NULL_NODE; } - p += len; + p = q; } - if (p < end) { - Node *srem; - - r = expand_case_fold_make_rem_string(&srem, p, end, reg); - if (r != 0) goto mem_err; - - if (IS_NOT_NULL(prev_node) && IS_NULL(root)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(srem); - onig_node_free(prev_node); - goto mem_err; - } - } - - if (IS_NULL(root)) { - prev_node = srem; + if (IS_NOT_NULL(list)) { + if (node_list_len(list) == 1) { + node_swap(node, NODE_CAR(list)); } else { - if (IS_NULL(onig_node_list_add(root, srem))) { - onig_node_free(srem); - goto mem_err; - } + node_swap(node, list); } + onig_node_free(list); + } + else { + node_swap(node, sn); + onig_node_free(sn); } - - /* ending */ - top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node); - swap_node(node, top_root); - onig_node_free(top_root); return 0; - mem_err: - r = ONIGERR_MEMORY; - err: - onig_node_free(top_root); + if (IS_NOT_NULL(list)) + onig_node_free(list); + else if (IS_NOT_NULL(sn)) + onig_node_free(sn); + return r; } -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT -static enum BodyEmpty +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +static enum BodyEmptyType quantifiers_memory_node_info(Node* node) { - int r = BODY_IS_EMPTY; + int r = BODY_IS_EMPTY_POSSIBILITY; switch (NODE_TYPE(node)) { case NODE_LIST: @@ -4022,7 +4445,7 @@ quantifiers_memory_node_info(Node* node) #ifdef USE_CALL case NODE_CALL: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_REC; /* tiny version */ + return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */ } else r = quantifiers_memory_node_info(NODE_BODY(node)); @@ -4044,9 +4467,9 @@ quantifiers_memory_node_info(Node* node) switch (en->type) { case BAG_MEMORY: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_REC; + return BODY_IS_EMPTY_POSSIBILITY_REC; } - return BODY_IS_EMPTY_MEM; + return BODY_IS_EMPTY_POSSIBILITY_MEM; break; case BAG_OPTION: @@ -4083,7 +4506,7 @@ quantifiers_memory_node_info(Node* node) return r; } -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #ifdef USE_CALL @@ -4092,7 +4515,7 @@ quantifiers_memory_node_info(Node* node) __inline #endif static int -setup_call_node_call(CallNode* cn, ScanEnv* env, int state) +tune_call_node_call(CallNode* cn, ScanEnv* env, int state) { MemEnv* mem_env = SCANENV_MEMENV(env); @@ -4112,7 +4535,7 @@ setup_call_node_call(CallNode* cn, ScanEnv* env, int state) } set_call_attr: - NODE_CALL_BODY(cn) = mem_env[cn->group_num].node; + NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node; if (IS_NULL(NODE_CALL_BODY(cn))) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); @@ -4143,23 +4566,23 @@ setup_call_node_call(CallNode* cn, ScanEnv* env, int state) } static void -setup_call2_call(Node* node) +tune_call2_call(Node* node) { switch (NODE_TYPE(node)) { case NODE_LIST: case NODE_ALT: do { - setup_call2_call(NODE_CAR(node)); + tune_call2_call(NODE_CAR(node)); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); break; case NODE_BAG: @@ -4169,19 +4592,19 @@ setup_call2_call(Node* node) if (en->type == BAG_MEMORY) { if (! NODE_IS_MARK1(node)) { NODE_STATUS_ADD(node, MARK1); - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); NODE_STATUS_REMOVE(node, MARK1); } } else if (en->type == BAG_IF_ELSE) { - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); if (IS_NOT_NULL(en->te.Then)) - setup_call2_call(en->te.Then); + tune_call2_call(en->te.Then); if (IS_NOT_NULL(en->te.Else)) - setup_call2_call(en->te.Else); + tune_call2_call(en->te.Else); } else { - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); } } break; @@ -4197,7 +4620,7 @@ setup_call2_call(Node* node) NODE_STATUS_ADD(called, CALLED); BAG_(called)->m.entry_count++; - setup_call2_call(called); + tune_call2_call(called); } NODE_STATUS_REMOVE(node, MARK1); } @@ -4209,7 +4632,7 @@ setup_call2_call(Node* node) } static int -setup_call(Node* node, ScanEnv* env, int state) +tune_call(Node* node, ScanEnv* env, int state) { int r; @@ -4217,7 +4640,7 @@ setup_call(Node* node, ScanEnv* env, int state) case NODE_LIST: case NODE_ALT: do { - r = setup_call(NODE_CAR(node), env, state); + r = tune_call(NODE_CAR(node), env, state); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -4225,12 +4648,12 @@ setup_call(Node* node, ScanEnv* env, int state) if (QUANT_(node)->upper == 0) state |= IN_ZERO_REPEAT; - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); else r = 0; break; @@ -4244,20 +4667,20 @@ setup_call(Node* node, ScanEnv* env, int state) NODE_STATUS_ADD(node, IN_ZERO_REPEAT); BAG_(node)->m.entry_count--; } - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); } else if (en->type == BAG_IF_ELSE) { - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); if (r != 0) return r; if (IS_NOT_NULL(en->te.Then)) { - r = setup_call(en->te.Then, env, state); + r = tune_call(en->te.Then, env, state); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_call(en->te.Else, env, state); + r = tune_call(en->te.Else, env, state); } else - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); } break; @@ -4267,7 +4690,7 @@ setup_call(Node* node, ScanEnv* env, int state) CALL_(node)->entry_count--; } - r = setup_call_node_call(CALL_(node), env, state); + r = tune_call_node_call(CALL_(node), env, state); break; default: @@ -4279,7 +4702,7 @@ setup_call(Node* node, ScanEnv* env, int state) } static int -setup_call2(Node* node) +tune_call2(Node* node) { int r = 0; @@ -4287,23 +4710,23 @@ setup_call2(Node* node) case NODE_LIST: case NODE_ALT: do { - r = setup_call2(NODE_CAR(node)); + r = tune_call2(NODE_CAR(node)); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: if (QUANT_(node)->upper != 0) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); break; case NODE_BAG: if (! NODE_IS_IN_ZERO_REPEAT(node)) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); { BagNode* en = BAG_(node); @@ -4311,18 +4734,18 @@ setup_call2(Node* node) if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = setup_call2(en->te.Then); + r = tune_call2(en->te.Then); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_call2(en->te.Else); + r = tune_call2(en->te.Else); } } break; case NODE_CALL: if (! NODE_IS_IN_ZERO_REPEAT(node)) { - setup_call2_call(node); + tune_call2_call(node); } break; @@ -4335,7 +4758,7 @@ setup_call2(Node* node) static void -setup_called_state_call(Node* node, int state) +tune_called_state_call(Node* node, int state) { switch (NODE_TYPE(node)) { case NODE_ALT: @@ -4343,7 +4766,7 @@ setup_called_state_call(Node* node, int state) /* fall */ case NODE_LIST: do { - setup_called_state_call(NODE_CAR(node), state); + tune_called_state_call(NODE_CAR(node), state); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -4351,12 +4774,12 @@ setup_called_state_call(Node* node, int state) { QuantNode* qn = QUANT_(node); - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - setup_called_state_call(NODE_QUANT_BODY(qn), state); + tune_called_state_call(NODE_QUANT_BODY(qn), state); } break; @@ -4371,7 +4794,7 @@ setup_called_state_call(Node* node, int state) /* fall */ case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: - setup_called_state_call(NODE_ANCHOR_BODY(an), state); + tune_called_state_call(NODE_ANCHOR_BODY(an), state); break; default: break; @@ -4387,31 +4810,33 @@ setup_called_state_call(Node* node, int state) if (NODE_IS_MARK1(node)) { if ((~en->m.called_state & state) != 0) { en->m.called_state |= state; - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); } } else { NODE_STATUS_ADD(node, MARK1); en->m.called_state |= state; - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); NODE_STATUS_REMOVE(node, MARK1); } } else if (en->type == BAG_IF_ELSE) { + state |= IN_ALT; + tune_called_state_call(NODE_BODY(node), state); if (IS_NOT_NULL(en->te.Then)) { - setup_called_state_call(en->te.Then, state); + tune_called_state_call(en->te.Then, state); } if (IS_NOT_NULL(en->te.Else)) - setup_called_state_call(en->te.Else, state); + tune_called_state_call(en->te.Else, state); } else { - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); } } break; case NODE_CALL: - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); break; default: @@ -4420,7 +4845,7 @@ setup_called_state_call(Node* node, int state) } static void -setup_called_state(Node* node, int state) +tune_called_state(Node* node, int state) { switch (NODE_TYPE(node)) { case NODE_ALT: @@ -4428,13 +4853,13 @@ setup_called_state(Node* node, int state) /* fall */ case NODE_LIST: do { - setup_called_state(NODE_CAR(node), state); + tune_called_state(NODE_CAR(node), state); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; #ifdef USE_CALL case NODE_CALL: - setup_called_state_call(node, state); + tune_called_state_call(node, state); break; #endif @@ -4451,14 +4876,15 @@ setup_called_state(Node* node, int state) /* fall */ case BAG_OPTION: case BAG_STOP_BACKTRACK: - setup_called_state(NODE_BODY(node), state); + tune_called_state(NODE_BODY(node), state); break; case BAG_IF_ELSE: - setup_called_state(NODE_BODY(node), state); + state |= IN_ALT; + tune_called_state(NODE_BODY(node), state); if (IS_NOT_NULL(en->te.Then)) - setup_called_state(en->te.Then, state); + tune_called_state(en->te.Then, state); if (IS_NOT_NULL(en->te.Else)) - setup_called_state(en->te.Else, state); + tune_called_state(en->te.Else, state); break; } } @@ -4468,12 +4894,12 @@ setup_called_state(Node* node, int state) { QuantNode* qn = QUANT_(node); - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - setup_called_state(NODE_QUANT_BODY(qn), state); + tune_called_state(NODE_QUANT_BODY(qn), state); } break; @@ -4488,7 +4914,7 @@ setup_called_state(Node* node, int state) /* fall */ case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: - setup_called_state(NODE_ANCHOR_BODY(an), state); + tune_called_state(NODE_ANCHOR_BODY(an), state); break; default: break; @@ -4509,13 +4935,13 @@ setup_called_state(Node* node, int state) #endif /* USE_CALL */ -static int setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env); +static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); #ifdef __GNUC__ __inline #endif static int -setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) { /* allowed node types in look-behind */ #define ALLOWED_TYPE_IN_LB \ @@ -4543,10 +4969,10 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) switch (an->type) { case ANCR_PREC_READ: - r = setup_tree(NODE_ANCHOR_BODY(an), reg, state, env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env); break; case ANCR_PREC_READ_NOT: - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); break; case ANCR_LOOK_BEHIND: @@ -4555,9 +4981,9 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_BAG_IN_LB, ALLOWED_ANCHOR_IN_LB); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env); if (r != 0) return r; - r = setup_look_behind(node, reg, env); + r = tune_look_behind(node, reg, env); } break; @@ -4567,10 +4993,10 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_BAG_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND), - env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND), + env); if (r != 0) return r; - r = setup_look_behind(node, reg, env); + r = tune_look_behind(node, reg, env); } break; @@ -4586,7 +5012,7 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) __inline #endif static int -setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) { int r; OnigLen d; @@ -4600,44 +5026,37 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) NODE_STATUS_ADD(node, IN_MULTI_ENTRY); } - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) { d = tree_min_len(body, env); if (d == 0) { -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT - qn->empty_info = quantifiers_memory_node_info(body); - if (qn->empty_info == BODY_IS_EMPTY_REC) { - if (NODE_TYPE(body) == NODE_BAG && - BAG_(body)->type == BAG_MEMORY) { - MEM_STATUS_ON(env->bt_mem_end, BAG_(body)->m.regnum); - } - } +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT + qn->emptiness = quantifiers_memory_node_info(body); #else - qn->empty_info = BODY_IS_EMPTY; + qn->emptiness = BODY_IS_EMPTY_POSSIBILITY; #endif } } - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - r = setup_tree(body, reg, state, env); + r = tune_tree(body, reg, state, env); if (r != 0) return r; /* expand string */ #define EXPAND_STRING_MAX_LENGTH 100 if (NODE_TYPE(body) == NODE_STRING) { - if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && + if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper && qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { int len = NODE_STRING_LEN(body); - StrNode* sn = STR_(body); if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { int i, n = qn->lower; - onig_node_conv_to_str_node(node, STR_(body)->flag); + node_conv_to_str_node(node, STR_(body)->flag); for (i = 0; i < n; i++) { - r = onig_node_str_cat(node, sn->s, sn->end); + r = node_str_node_cat(node, body); if (r != 0) return r; } onig_node_free(body); @@ -4646,7 +5065,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } } - if (qn->greedy && (qn->empty_info == BODY_IS_NOT_EMPTY)) { + if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) { if (NODE_TYPE(body) == NODE_QUANT) { QuantNode* tqn = QUANT_(body); if (IS_NOT_NULL(tqn->head_exact)) { @@ -4662,8 +5081,8 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) return r; } -/* setup_tree does the following work. - 1. check empty loop. (set qn->empty_info) +/* tune_tree does the following work. + 1. check empty loop. (set qn->emptiness) 2. expand ignore-case in char class. 3. set memory status bit flags. (reg->mem_stats) 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. @@ -4671,7 +5090,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) 6. expand repeated string. */ static int -setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { int r = 0; @@ -4680,9 +5099,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { Node* prev = NULL_NODE; do { - r = setup_tree(NODE_CAR(node), reg, state, env); + r = tune_tree(NODE_CAR(node), reg, state, env); if (IS_NOT_NULL(prev) && r == 0) { - r = next_setup(prev, NODE_CAR(node), reg); + r = tune_next(prev, NODE_CAR(node), reg); } prev = NODE_CAR(node); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); @@ -4691,13 +5110,13 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) case NODE_ALT: do { - r = setup_tree(NODE_CAR(node), reg, (state | IN_ALT), env); + r = tune_tree(NODE_CAR(node), reg, (state | IN_ALT), env); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_STRING: - if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_RAW(node)) { - r = expand_case_fold_string(node, reg, state); + if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_CRUDE(node)) { + r = unravel_case_fold_string(node, reg, state); } break; @@ -4710,12 +5129,18 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) for (i = 0; i < br->back_num; i++) { if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; MEM_STATUS_ON(env->backrefed_mem, p[i]); - MEM_STATUS_ON(env->bt_mem_start, p[i]); +#if 0 #ifdef USE_BACKREF_WITH_LEVEL if (NODE_IS_NEST_LEVEL(node)) { - MEM_STATUS_ON(env->bt_mem_end, p[i]); + MEM_STATUS_ON(env->backtrack_mem, p[i]); } #endif +#else + /* More precisely, it should be checked whether alt/repeat exists before + the subject capture node, and then this backreference position + exists before (or in) the capture node. */ + MEM_STATUS_ON(env->backtrack_mem, p[i]); +#endif } } break; @@ -4729,7 +5154,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { OnigOptionType options = reg->options; reg->options = BAG_(node)->o.options; - r = setup_tree(NODE_BODY(node), reg, state, env); + r = tune_tree(NODE_BODY(node), reg, state, env); reg->options = options; } break; @@ -4741,46 +5166,46 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0 || NODE_IS_RECURSION(node)) { - MEM_STATUS_ON(env->bt_mem_start, en->m.regnum); + MEM_STATUS_ON(env->backtrack_mem, en->m.regnum); } - r = setup_tree(NODE_BODY(node), reg, state, env); + r = tune_tree(NODE_BODY(node), reg, state, env); break; case BAG_STOP_BACKTRACK: { Node* target = NODE_BODY(node); - r = setup_tree(target, reg, state, env); + r = tune_tree(target, reg, state, env); if (NODE_TYPE(target) == NODE_QUANT) { QuantNode* tqn = QUANT_(target); - if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && + if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 && tqn->greedy != 0) { /* (?>a*), a*+ etc... */ - if (NODE_IS_SIMPLE_TYPE(NODE_BODY(target))) - NODE_STATUS_ADD(node, STOP_BT_SIMPLE_REPEAT); + if (is_strict_real_node(NODE_BODY(target))) + NODE_STATUS_ADD(node, STRICT_REAL_REPEAT); } } } break; case BAG_IF_ELSE: - r = setup_tree(NODE_BODY(node), reg, (state | IN_ALT), env); + r = tune_tree(NODE_BODY(node), reg, (state | IN_ALT), env); if (r != 0) return r; if (IS_NOT_NULL(en->te.Then)) { - r = setup_tree(en->te.Then, reg, (state | IN_ALT), env); + r = tune_tree(en->te.Then, reg, (state | IN_ALT), env); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_tree(en->te.Else, reg, (state | IN_ALT), env); + r = tune_tree(en->te.Else, reg, (state | IN_ALT), env); break; } } break; case NODE_QUANT: - r = setup_quant(node, reg, state, env); + r = tune_quant(node, reg, state, env); break; case NODE_ANCHOR: - r = setup_anchor(node, reg, state, env); + r = tune_anchor(node, reg, state, env); break; #ifdef USE_CALL @@ -4879,7 +5304,7 @@ typedef struct { } MinMax; typedef struct { - MinMax mmd; + MinMax mm; OnigEncoding enc; OnigOptionType options; OnigCaseFoldType case_fold_flag; @@ -4892,17 +5317,16 @@ typedef struct { } OptAnc; typedef struct { - MinMax mmd; /* position */ + MinMax mm; /* position */ OptAnc anc; int reach_end; int case_fold; - int good_case_fold; int len; UChar s[OPT_EXACT_MAXLEN]; } OptStr; typedef struct { - MinMax mmd; /* position */ + MinMax mm; /* position */ OptAnc anc; int value; /* weighted value */ UChar map[CHAR_MAP_SIZE]; @@ -5119,11 +5543,10 @@ is_full_opt_exact(OptStr* e) static void clear_opt_exact(OptStr* e) { - clear_mml(&e->mmd); + clear_mml(&e->mm); clear_opt_anc_info(&e->anc); e->reach_end = 0; e->case_fold = 0; - e->good_case_fold = 0; e->len = 0; e->s[0] = '\0'; } @@ -5147,11 +5570,6 @@ concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc) to->case_fold = 1; } - else { - if (to->good_case_fold != 0) { - if (add->good_case_fold == 0) return 0; - } - } } r = 0; @@ -5206,7 +5624,7 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) return ; } - if (! is_equal_mml(&to->mmd, &add->mmd)) { + if (! is_equal_mml(&to->mm, &add->mm)) { clear_opt_exact(to); return ; } @@ -5228,8 +5646,6 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) to->len = i; if (add->case_fold != 0) to->case_fold = 1; - if (add->good_case_fold == 0) - to->good_case_fold = 0; alt_merge_opt_anc_info(&to->anc, &add->anc); if (! to->reach_end) to->anc.right = 0; @@ -5262,10 +5678,7 @@ select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt) if (now->case_fold == 0) vn *= 2; if (alt->case_fold == 0) va *= 2; - if (now->good_case_fold != 0) vn *= 4; - if (alt->good_case_fold != 0) va *= 4; - - if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0) + if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0) copy_opt_exact(now, alt); } @@ -5349,7 +5762,7 @@ select_opt_map(OptMap* now, OptMap* alt) vn = z / now->value; va = z / alt->value; - if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0) + if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0) copy_opt_map(now, alt); } @@ -5363,17 +5776,14 @@ comp_opt_exact_or_map(OptStr* e, OptMap* m) if (m->value <= 0) return -1; if (e->case_fold != 0) { - if (e->good_case_fold != 0) - case_value = 2; - else - case_value = 1; + case_value = 1; } else case_value = 3; ae = COMP_EM_BASE * e->len * case_value; am = COMP_EM_BASE * 5 * 2 / m->value; - return comp_distance_value(&e->mmd, &m->mmd, ae, am); + return comp_distance_value(&e->mm, &m->mm, ae, am); } static void @@ -5381,14 +5791,14 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) { int i, val; - /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */ + /* if (! is_equal_mml(&to->mm, &add->mm)) return ; */ if (to->value == 0) return ; - if (add->value == 0 || to->mmd.max < add->mmd.min) { + if (add->value == 0 || to->mm.max < add->mm.min) { clear_opt_map(to); return ; } - alt_merge_mml(&to->mmd, &add->mmd); + alt_merge_mml(&to->mm, &add->mm); val = 0; for (i = 0; i < CHAR_MAP_SIZE; i++) { @@ -5406,9 +5816,9 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) static void set_bound_node_opt_info(OptNode* opt, MinMax* plen) { - copy_mml(&(opt->sb.mmd), plen); - copy_mml(&(opt->spr.mmd), plen); - copy_mml(&(opt->map.mmd), plen); + copy_mml(&(opt->sb.mm), plen); + copy_mml(&(opt->spr.mm), plen); + copy_mml(&(opt->map.mm), plen); } static void @@ -5443,7 +5853,7 @@ concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add) } if (add->map.value > 0 && to->len.max == 0) { - if (add->map.mmd.max == 0) + if (add->map.mm.max == 0) add->map.anc.left |= to->anc.left; } @@ -5468,10 +5878,7 @@ concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add) if (to->spr.len > 0) { if (add->len.max > 0) { - if (to->spr.len > (int )add->len.max) - to->spr.len = add->len.max; - - if (to->spr.mmd.max == 0) + if (to->spr.mm.max == 0) select_opt_exact(enc, &to->sb, &to->spr); else select_opt_exact(enc, &to->sm, &to->spr); @@ -5511,7 +5918,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) r = 0; enc = env->enc; clear_node_opt_info(opt); - set_bound_node_opt_info(opt, &env->mmd); + set_bound_node_opt_info(opt, &env->mm); switch (NODE_TYPE(node)) { case NODE_LIST: @@ -5523,7 +5930,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) do { r = optimize_nodes(NODE_CAR(nd), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mmd, &xo.len); + add_mml(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); } } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd))); @@ -5548,9 +5955,8 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) { StrNode* sn = STR_(node); int slen = (int )(sn->end - sn->s); - /* int is_raw = NODE_STRING_IS_RAW(node); */ - if (! NODE_STRING_IS_AMBIG(node)) { + if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) { concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); if (slen > 0) { add_char_opt_map(&opt->map, *(sn->s), enc); @@ -5558,28 +5964,20 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) set_mml(&opt->len, slen, slen); } else { - int max; + int max, min; - if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) { - int n = onigenc_strlen(enc, sn->s, sn->end); - max = ONIGENC_MBC_MAXLEN_DIST(enc) * n; - } - else { - concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); - opt->sb.case_fold = 1; - if (NODE_STRING_IS_GOOD_AMBIG(node)) - opt->sb.good_case_fold = 1; - - if (slen > 0) { - r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, - enc, env->case_fold_flag); - if (r != 0) break; - } + concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); + opt->sb.case_fold = 1; - max = slen; + if (slen > 0) { + r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, + enc, env->case_fold_flag); + if (r != 0) break; } - set_mml(&opt->len, slen, max); + max = slen; + min = sn->case_min_len * ONIGENC_MBC_MINLEN(enc); + set_mml(&opt->len, min, max); } } break; @@ -5589,7 +5987,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) int z; CClassNode* cc = CCLASS_(node); - /* no need to check ignore case. (set in setup_tree()) */ + /* no need to check ignore case. (set in tune_tree()) */ if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) { OnigLen min = ONIGENC_MBC_MINLEN(enc); @@ -5699,11 +6097,11 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) break; } backs = BACKREFS_P(br); - min = tree_min_len(mem_env[backs[0]].node, env->scan_env); - max = tree_max_len(mem_env[backs[0]].node, env->scan_env); + min = tree_min_len(mem_env[backs[0]].mem_node, env->scan_env); + max = tree_max_len(mem_env[backs[0]].mem_node, env->scan_env); for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].node, env->scan_env); - tmax = tree_max_len(mem_env[backs[i]].node, env->scan_env); + tmin = tree_min_len(mem_env[backs[i]].mem_node, env->scan_env); + tmax = tree_max_len(mem_env[backs[i]].mem_node, env->scan_env); if (min > tmin) min = tmin; if (max < tmax) max = tmax; } @@ -5752,8 +6150,8 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) opt->sm.reach_end = 0; } - if (IS_REPEAT_INFINITE(qn->upper)) { - if (env->mmd.max == 0 && + if (IS_INFINITE_REPEAT(qn->upper)) { + if (env->mm.max == 0 && NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) { if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML); @@ -5821,7 +6219,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) copy_opt_env(&nenv, env); r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mmd, &xo.len); + add_mml(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); if (IS_NOT_NULL(en->te.Then)) { r = optimize_nodes(en->te.Then, &xo, &nenv); @@ -5870,15 +6268,6 @@ set_optimize_exact(regex_t* reg, OptStr* e) if (e->case_fold) { reg->optimize = OPTIMIZE_STR_CASE_FOLD; - if (e->good_case_fold != 0) { - if (e->len >= 2) { - r = set_sunday_quick_search_or_bmh_skip_table(reg, 1, - reg->exact, reg->exact_end, - reg->map, &(reg->map_offset)); - if (r != 0) return r; - reg->optimize = OPTIMIZE_STR_CASE_FOLD_FAST; - } - } } else { int allow_reverse; @@ -5901,11 +6290,17 @@ set_optimize_exact(regex_t* reg, OptStr* e) } } - reg->dmin = e->mmd.min; - reg->dmax = e->mmd.max; + reg->dist_min = e->mm.min; + reg->dist_max = e->mm.max; + + if (reg->dist_min != INFINITE_LEN) { + int n; + if (e->case_fold != 0) + n = 1; + else + n = (int )(reg->exact_end - reg->exact); - if (reg->dmin != INFINITE_LEN) { - reg->threshold_len = reg->dmin + (int )(reg->exact_end - reg->exact); + reg->threshold_len = reg->dist_min + n; } return 0; @@ -5920,11 +6315,11 @@ set_optimize_map(regex_t* reg, OptMap* m) reg->map[i] = m->map[i]; reg->optimize = OPTIMIZE_MAP; - reg->dmin = m->mmd.min; - reg->dmax = m->mmd.max; + reg->dist_min = m->mm.min; + reg->dist_max = m->mm.max; - if (reg->dmin != INFINITE_LEN) { - reg->threshold_len = reg->dmin + 1; + if (reg->dist_min != INFINITE_LEN) { + reg->threshold_len = reg->dist_min + 1; } } @@ -5950,7 +6345,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) env.options = reg->options; env.case_fold_flag = reg->case_fold_flag; env.scan_env = scan_env; - clear_mml(&env.mmd); + clear_mml(&env.mm); r = optimize_nodes(node, &opt, &env); if (r != 0) return r; @@ -5966,8 +6361,8 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) ANCR_PREC_READ_NOT); if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) { - reg->anchor_dmin = opt.len.min; - reg->anchor_dmax = opt.len.max; + reg->anc_dist_min = opt.len.min; + reg->anc_dist_max = opt.len.max; } if (opt.sb.len > 0 || opt.sm.len > 0) { @@ -6002,8 +6397,8 @@ clear_optimize_info(regex_t* reg) { reg->optimize = OPTIMIZE_NONE; reg->anchor = 0; - reg->anchor_dmin = 0; - reg->anchor_dmax = 0; + reg->anc_dist_min = 0; + reg->anc_dist_max = 0; reg->sub_anchor = 0; reg->exact_end = (UChar* )NULL; reg->map_offset = 0; @@ -6122,12 +6517,12 @@ print_optimize_info(FILE* f, regex_t* reg) { static const char* on[] = { "NONE", "STR", "STR_FAST", "STR_FAST_STEP_FORWARD", - "STR_CASE_FOLD_FAST", "STR_CASE_FOLD", "MAP" }; + "STR_CASE_FOLD", "MAP" }; fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " anchor: "); print_anchor(f, reg->anchor); if ((reg->anchor & ANCR_END_BUF_MASK) != 0) - print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax); + print_distance_range(f, reg->anc_dist_min, reg->anc_dist_max); fprintf(f, "\n"); if (reg->optimize) { @@ -6275,7 +6670,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, Node* root; ScanEnv scan_env; #ifdef USE_CALL - UnsetAddrList uslist; + UnsetAddrList uslist = {0}; #endif root = 0; @@ -6299,13 +6694,17 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->string_pool_end = 0; reg->num_mem = 0; reg->num_repeat = 0; - reg->num_null_check = 0; + reg->num_empty_check = 0; reg->repeat_range_alloc = 0; - reg->repeat_range = (OnigRepeatRange* )NULL; + reg->repeat_range = (RepeatRange* )NULL; + reg->empty_status_mem = 0; r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; + r = reduce_string_list(root); + if (r != 0) goto err; + /* mixed use named group and no-named group */ if (scan_env.num_named > 0 && IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && @@ -6326,38 +6725,65 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, r = unset_addr_list_init(&uslist, scan_env.num_call); if (r != 0) goto err; scan_env.unset_addr_list = &uslist; - r = setup_call(root, &scan_env, 0); + r = tune_call(root, &scan_env, 0); if (r != 0) goto err_unset; - r = setup_call2(root); + r = tune_call2(root); if (r != 0) goto err_unset; r = recursive_call_check_trav(root, &scan_env, 0); if (r < 0) goto err_unset; r = infinite_recursive_call_check_trav(root, &scan_env); if (r != 0) goto err_unset; - setup_called_state(root, 0); + tune_called_state(root, 0); } reg->num_call = scan_env.num_call; #endif - r = setup_tree(root, reg, 0, &scan_env); +#ifdef ONIG_DEBUG_PARSE + fprintf(stderr, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); + fprintf(stderr, "TREE (parsed)\n"); + print_tree(stderr, root); + fprintf(stderr, "\n"); +#endif + + r = tune_tree(root, reg, 0, &scan_env); if (r != 0) goto err_unset; + if (scan_env.backref_num != 0) { + set_parent_node_trav(root, NULL_NODE); + r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env); + if (r != 0) goto err_unset; + set_empty_status_check_trav(root, &scan_env); + } + #ifdef ONIG_DEBUG_PARSE + fprintf(stderr, "TREE (after tune)\n"); print_tree(stderr, root); + fprintf(stderr, "\n"); #endif - reg->capture_history = scan_env.capture_history; - reg->bt_mem_start = scan_env.bt_mem_start; - reg->bt_mem_start |= reg->capture_history; - if (IS_FIND_CONDITION(reg->options)) - MEM_STATUS_ON_ALL(reg->bt_mem_end); + reg->capture_history = scan_env.cap_history; + reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history; + +#ifdef USE_CALLOUT + if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) { + reg->push_mem_end = reg->push_mem_start; + } else { - reg->bt_mem_end = scan_env.bt_mem_end; - reg->bt_mem_end |= reg->capture_history; + if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start)) + reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history; + else + reg->push_mem_end = reg->push_mem_start & + (scan_env.backrefed_mem | scan_env.cap_history); } - reg->bt_mem_start |= reg->bt_mem_end; +#else + if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start)) + reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history; + else + reg->push_mem_end = reg->push_mem_start & + (scan_env.backrefed_mem | scan_env.cap_history); +#endif clear_optimize_info(reg); #ifndef ONIG_DONT_OPTIMIZE @@ -6391,14 +6817,20 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #endif - if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0) + set_addr_in_repeat_range(reg); + + if ((reg->push_mem_end != 0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + || (reg->num_repeat != 0) + || (reg->num_empty_check != 0) +#endif #ifdef USE_CALLOUT || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) #endif ) reg->stack_pop_level = STACK_POP_LEVEL_ALL; else { - if (reg->bt_mem_start != 0) + if (reg->push_mem_start != 0) reg->stack_pop_level = STACK_POP_LEVEL_MEM_START; else reg->stack_pop_level = STACK_POP_LEVEL_FREE; @@ -6531,11 +6963,14 @@ onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, if (IS_NULL(*reg)) return ONIGERR_MEMORY; r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); - if (r != 0) goto err; + if (r != 0) { + xfree(*reg); + *reg = NULL; + return r; + } r = onig_compile(*reg, pattern, pattern_end, einfo); if (r != 0) { - err: onig_free(*reg); *reg = NULL; } @@ -6672,6 +7107,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) } else { len = ONIGENC_CODE_TO_MBCLEN(enc, code); + if (len < 0) return 0; } return onig_is_code_in_cc_len(len, code, cc); } @@ -6679,12 +7115,14 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) #ifdef ONIG_DEBUG_PARSE +#ifdef USE_CALL static void p_string(FILE* f, int len, UChar* s) { fputs(":", f); while (len-- > 0) { fputc(*s++, f); } } +#endif static void Indent(FILE* f, int indent) @@ -6704,7 +7142,7 @@ print_indent_tree(FILE* f, Node* node, int indent) Indent(f, indent); if (IS_NULL(node)) { fprintf(f, "ERROR: null node!!!\n"); - exit (0); + exit(0); } type = NODE_TYPE(node); @@ -6728,28 +7166,22 @@ print_indent_tree(FILE* f, Node* node, int indent) case NODE_STRING: { + char* str; char* mode; - char* dont; - char* good; - if (NODE_STRING_IS_RAW(node)) - mode = "-raw"; - else if (NODE_STRING_IS_AMBIG(node)) - mode = "-ambig"; + if (NODE_STRING_IS_CRUDE(node)) + mode = "-crude"; + else if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) + mode = "-case_fold_match"; else mode = ""; - if (NODE_STRING_IS_GOOD_AMBIG(node)) - good = "-good"; - else - good = ""; - - if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) - dont = " (dont-opt)"; + if (STR_(node)->s == STR_(node)->end) + str = "empty-string"; else - dont = ""; + str = "string"; - fprintf(f, "<string%s%s%s:%p>", mode, good, dont, node); + fprintf(f, "<%s%s:%p>", str, mode, node); for (p = STR_(node)->s; p < STR_(node)->end; p++) { if (*p >= 0x20 && *p < 0x7f) fputc(*p, f); @@ -6871,6 +7303,34 @@ print_indent_tree(FILE* f, Node* node, int indent) case NODE_BAG: fprintf(f, "<bag:%p> ", node); + if (BAG_(node)->type == BAG_IF_ELSE) { + Node* Then; + Node* Else; + BagNode* bn; + + bn = BAG_(node); + fprintf(f, "if-else\n"); + print_indent_tree(f, NODE_BODY(node), indent + add); + + Then = bn->te.Then; + Else = bn->te.Else; + if (IS_NULL(Then)) { + Indent(f, indent + add); + fprintf(f, "THEN empty\n"); + } + else + print_indent_tree(f, Then, indent + add); + + if (IS_NULL(Else)) { + Indent(f, indent + add); + fprintf(f, "ELSE empty\n"); + } + else + print_indent_tree(f, Else, indent + add); + + break; + } + switch (BAG_(node)->type) { case BAG_OPTION: fprintf(f, "option:%d", BAG_(node)->o.options); @@ -6881,8 +7341,7 @@ print_indent_tree(FILE* f, Node* node, int indent) case BAG_STOP_BACKTRACK: fprintf(f, "stop-bt"); break; - case BAG_IF_ELSE: - fprintf(f, "if-else"); + default: break; } fprintf(f, "\n"); |