diff options
Diffstat (limited to 'src/regcomp.c')
-rw-r--r-- | src/regcomp.c | 2222 |
1 files changed, 1384 insertions, 838 deletions
diff --git a/src/regcomp.c b/src/regcomp.c index 69d4b95..4d5b78f 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -2,7 +2,7 @@ regcomp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,8 +31,21 @@ #define OPS_INIT_SIZE 8 +typedef struct { + OnigLen min; + OnigLen max; +} MinMaxLen; + +typedef struct { + OnigLen min; + OnigLen max; + int min_is_sure; +} MinMaxCharLen; + OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN; +static OnigLen node_min_byte_len(Node* node, ScanEnv* env); + #if 0 typedef struct { int n; @@ -99,7 +112,7 @@ int_stack_pop(int_stack* s) #ifdef ONIG_DEBUG if (s->n <= 0) { - fprintf(stderr, "int_stack_pop: fail empty. %p\n", s); + fprintf(DBGFP, "int_stack_pop: fail empty. %p\n", s); return 0; } #endif @@ -228,13 +241,13 @@ ops_free(regex_t* reg) if (! is_in_string_pool(reg, op->exact_len_n.s)) xfree(op->exact_len_n.s); break; - case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N: case OP_STR_N_IC: + case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N: if (! is_in_string_pool(reg, op->exact_n.s)) xfree(op->exact_n.s); break; case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4: case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2: - case OP_STR_MB2N3: case OP_STR_1_IC: + case OP_STR_MB2N3: break; case OP_CCLASS_NOT: case OP_CCLASS: @@ -302,9 +315,6 @@ ops_calc_size_of_string_pool(regex_t* reg) total += op->exact_len_n.len * op->exact_len_n.n; break; case OP_STR_N: - case OP_STR_N_IC: - total += op->exact_n.n; - break; case OP_STR_MB2N: total += op->exact_n.n * 2; break; @@ -357,7 +367,6 @@ ops_make_string_pool(regex_t* reg) curr += len; break; case OP_STR_N: - case OP_STR_N_IC: len = op->exact_n.n; copy: xmemcpy(curr, op->exact_n.s, len); @@ -398,12 +407,12 @@ onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag) } static int -int_multiply_cmp(int x, int y, int v) +len_multiply_cmp(OnigLen x, int y, OnigLen v) { if (x == 0 || y == 0) return -1; - if (x < INT_MAX / y) { - int xy = x * y; + if (x < INFINITE_LEN / y) { + OnigLen xy = x * (OnigLen )y; if (xy > v) return 1; else { if (xy == v) return 0; @@ -411,7 +420,7 @@ int_multiply_cmp(int x, int y, int v) } } else - return 1; + return v == INFINITE_LEN ? 0 : 1; } extern int @@ -419,7 +428,7 @@ onig_positive_int_multiply(int x, int y) { if (x == 0 || y == 0) return 0; - if (x < INT_MAX / y) + if (x < ONIG_INT_MAX / y) return x * y; else return -1; @@ -489,42 +498,29 @@ node_str_node_cat(Node* node, Node* add) { int r; - if (STR_(node)->flag != STR_(add)->flag) + if (NODE_STATUS(node) != NODE_STATUS(add)) return ONIGERR_TYPE_BUG; - r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end); - if (r != 0) return r; - - if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) - STR_(node)->case_min_len += STR_(add)->case_min_len; - - return 0; -} - -static int -node_str_cat_case_fold(Node* node, const UChar* s, const UChar* end, int case_min_len) -{ - int r; - - if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) + if (STR_(node)->flag != STR_(add)->flag) return ONIGERR_TYPE_BUG; - r = onig_node_str_cat(node, s, end); + r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end); if (r != 0) return r; - STR_(node)->case_min_len += case_min_len; return 0; } static void -node_conv_to_str_node(Node* node, int flag) +node_conv_to_str_node(Node* node, Node* ref_node) { + xmemset(node, 0, sizeof(*node)); NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->flag = flag; + NODE_STATUS(node) = NODE_STATUS(ref_node); + + STR_(node)->flag = STR_(ref_node)->flag; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; STR_(node)->capacity = 0; - STR_(node)->case_min_len = 0; } static OnigLen @@ -554,7 +550,7 @@ bitset_is_empty(BitSetRef bs) { int i; - for (i = 0; i < (int )BITSET_SIZE; i++) { + for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { if (bs[i] != 0) return 0; } return 1; @@ -602,6 +598,351 @@ unset_addr_list_add(UnsetAddrList* list, int offset, struct _Node* node) } #endif /* USE_CALL */ +enum CharLenReturnType { + CHAR_LEN_NORMAL = 0, /* fixed or variable */ + CHAR_LEN_TOP_ALT_FIXED = 1 +}; + +static int +mmcl_fixed(MinMaxCharLen* c) +{ + return (c->min == c->max && c->min != INFINITE_LEN); +} + +static void +mmcl_set(MinMaxCharLen* l, OnigLen len) +{ + l->min = len; + l->max = len; + l->min_is_sure = TRUE; +} + +static void +mmcl_set_min_max(MinMaxCharLen* l, OnigLen min, OnigLen max, int min_is_sure) +{ + l->min = min; + l->max = max; + l->min_is_sure = min_is_sure; +} + +static void +mmcl_add(MinMaxCharLen* to, MinMaxCharLen* add) +{ + to->min = distance_add(to->min, add->min); + to->max = distance_add(to->max, add->max); + + to->min_is_sure = add->min_is_sure != 0 && to->min_is_sure != 0; +} + +static void +mmcl_multiply(MinMaxCharLen* to, int m) +{ + to->min = distance_multiply(to->min, m); + to->max = distance_multiply(to->max, m); +} + +static void +mmcl_repeat_range_multiply(MinMaxCharLen* to, int mlow, int mhigh) +{ + to->min = distance_multiply(to->min, mlow); + + if (IS_INFINITE_REPEAT(mhigh)) + to->max = INFINITE_LEN; + else + to->max = distance_multiply(to->max, mhigh); +} + +static void +mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt) +{ + if (to->min > alt->min) { + to->min = alt->min; + if (alt->min_is_sure != 0) + to->min_is_sure = TRUE; + } + + if (to->max < alt->max) to->max = alt->max; +} + +static int +mml_is_equal(MinMaxLen* a, MinMaxLen* b) +{ + return a->min == b->min && a->max == b->max; +} + +static void +mml_set_min_max(MinMaxLen* l, OnigLen min, OnigLen max) +{ + l->min = min; + l->max = max; +} + +static void +mml_clear(MinMaxLen* l) +{ + l->min = l->max = 0; +} + +static void +mml_copy(MinMaxLen* to, MinMaxLen* from) +{ + to->min = from->min; + to->max = from->max; +} + +static void +mml_add(MinMaxLen* to, MinMaxLen* add) +{ + to->min = distance_add(to->min, add->min); + to->max = distance_add(to->max, add->max); +} + +static void +mml_alt_merge(MinMaxLen* to, MinMaxLen* alt) +{ + if (to->min > alt->min) to->min = alt->min; + if (to->max < alt->max) to->max = alt->max; +} + +/* fixed size pattern node only */ +static int +node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, + int level) +{ + MinMaxCharLen tci; + int r = CHAR_LEN_NORMAL; + + level++; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + { + int first = TRUE; + do { + r = node_char_len1(NODE_CAR(node), reg, &tci, env, level); + if (r < 0) break; + if (first == TRUE) { + *ci = tci; + first = FALSE; + } + else + mmcl_add(ci, &tci); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + } + break; + + case NODE_ALT: + { + int fixed; + + r = node_char_len1(NODE_CAR(node), reg, ci, env, level); + if (r < 0) break; + + fixed = TRUE; + while (IS_NOT_NULL(node = NODE_CDR(node))) { + r = node_char_len1(NODE_CAR(node), reg, &tci, env, level); + if (r < 0) break; + if (! mmcl_fixed(&tci)) + fixed = FALSE; + mmcl_alt_merge(ci, &tci); + } + if (r < 0) break; + + r = CHAR_LEN_NORMAL; + if (mmcl_fixed(ci)) break; + + if (fixed == TRUE && level == 1) { + r = CHAR_LEN_TOP_ALT_FIXED; + } + } + break; + + case NODE_STRING: + { + OnigLen clen; + StrNode* sn = STR_(node); + UChar *s = sn->s; + + if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { + /* Such a case is possible. + ex. /(?i)(?<=\1)(a)/ + Backref node refer to capture group, but it doesn't tune yet. + */ + r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + break; + } + + clen = 0; + while (s < sn->end) { + s += enclen(reg->enc, s); + clen = distance_add(clen, 1); + } + mmcl_set(ci, clen); + } + break; + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + + if (qn->lower == qn->upper) { + if (qn->upper == 0) { + mmcl_set(ci, 0); + } + else { + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + if (r < 0) break; + mmcl_multiply(ci, qn->lower); + } + } + else { + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + if (r < 0) break; + mmcl_repeat_range_multiply(ci, qn->lower, qn->upper); + } + } + break; + +#ifdef USE_CALL + case NODE_CALL: + if (NODE_IS_RECURSION(node)) + mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE); + else + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + break; +#endif + + case NODE_CTYPE: + case NODE_CCLASS: + mmcl_set(ci, 1); + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + switch (en->type) { + case BAG_MEMORY: + if (NODE_IS_FIXED_CLEN(node)) { + mmcl_set_min_max(ci, en->min_char_len, en->max_char_len, + NODE_IS_FIXED_CLEN_MIN_SURE(node)); + } + else { + if (NODE_IS_MARK1(node)) { + mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE); + } + else { + NODE_STATUS_ADD(node, MARK1); + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + NODE_STATUS_REMOVE(node, MARK1); + if (r < 0) break; + + en->min_char_len = ci->min; + en->max_char_len = ci->max; + NODE_STATUS_ADD(node, FIXED_CLEN); + if (ci->min_is_sure != 0) + NODE_STATUS_ADD(node, FIXED_CLEN_MIN_SURE); + } + } + /* can't optimize look-behind if capture exists. */ + ci->min_is_sure = FALSE; + break; + case BAG_OPTION: + case BAG_STOP_BACKTRACK: + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + break; + case BAG_IF_ELSE: + { + MinMaxCharLen eci; + + r = node_char_len1(NODE_BODY(node), reg, ci, env, level); + if (r < 0) break; + + if (IS_NOT_NULL(en->te.Then)) { + r = node_char_len1(en->te.Then, reg, &tci, env, level); + if (r < 0) break; + mmcl_add(ci, &tci); + } + + if (IS_NOT_NULL(en->te.Else)) { + r = node_char_len1(en->te.Else, reg, &eci, env, level); + if (r < 0) break; + } + else { + mmcl_set(&eci, 0); + } + + mmcl_alt_merge(ci, &eci); + } + break; + default: /* never come here */ + r = ONIGERR_PARSER_BUG; + break; + } + } + break; + + case NODE_ANCHOR: + mmcl_set(ci, 0); + /* can't optimize look-behind if anchor exists. */ + ci->min_is_sure = FALSE; + break; + + case NODE_GIMMICK: + zero: + mmcl_set(ci, 0); + break; + + case NODE_BACKREF: + if (NODE_IS_CHECKER(node)) + goto zero; + + if (NODE_IS_RECURSION(node)) { +#ifdef USE_BACKREF_WITH_LEVEL + if (NODE_IS_NEST_LEVEL(node)) { + mmcl_set_min_max(ci, 0, INFINITE_LEN, FALSE); + break; + } +#endif + + mmcl_set_min_max(ci, 0, 0, FALSE); + break; + } + + { + int i; + int* backs; + MemEnv* mem_env = SCANENV_MEMENV(env); + BackRefNode* br = BACKREF_(node); + + backs = BACKREFS_P(br); + r = node_char_len1(mem_env[backs[0]].mem_node, reg, ci, env, level); + if (r < 0) break; + if (! mmcl_fixed(ci)) ci->min_is_sure = FALSE; + + for (i = 1; i < br->back_num; i++) { + r = node_char_len1(mem_env[backs[i]].mem_node, reg, &tci, env, level); + if (r < 0) break; + if (! mmcl_fixed(&tci)) tci.min_is_sure = FALSE; + mmcl_alt_merge(ci, &tci); + } + } + break; + + default: /* never come here */ + r = ONIGERR_PARSER_BUG; + break; + } + + return r; +} + +static int +node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env) +{ + return node_char_len1(node, reg, ci, env, 0); +} + static int add_op(regex_t* reg, int opcode) @@ -626,7 +967,7 @@ static int compile_tree(Node* node, regex_t* reg, ScanEnv* env); #define IS_NEED_STR_LEN_OP(op) \ ((op) == OP_STR_N || (op) == OP_STR_MB2N ||\ - (op) == OP_STR_MB3N || (op) == OP_STR_MBN || (op) == OP_STR_N_IC) + (op) == OP_STR_MB3N || (op) == OP_STR_MBN) static int select_str_opcode(int mb_len, int str_len) @@ -711,16 +1052,16 @@ compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; if (emptiness != BODY_IS_NOT_EMPTY) { - if (emptiness == BODY_IS_EMPTY_POSSIBILITY) + if (emptiness == BODY_MAY_BE_EMPTY) r = add_op(reg, OP_EMPTY_CHECK_END); - else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) { + else if (emptiness == BODY_MAY_BE_EMPTY_MEM) { if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); else r = add_op(reg, OP_EMPTY_CHECK_END); } #ifdef USE_CALL - else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) + else if (emptiness == BODY_MAY_BE_EMPTY_REC) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); #endif @@ -794,12 +1135,7 @@ add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg) else if (IS_NEED_STR_LEN_OP(op)) { p = onigenc_strdup(reg->enc, s, end); CHECK_NULL_RETURN_MEMERR(p); - - if (op == OP_STR_N_IC) - COP(reg)->exact_n.n = byte_len; - else - COP(reg)->exact_n.n = str_len; - + COP(reg)->exact_n.n = str_len; COP(reg)->exact_n.s = p; } else { @@ -822,8 +1158,6 @@ compile_length_string_node(Node* node, regex_t* reg) if (sn->end <= sn->s) return 0; - if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) return 1; - p = prev = sn->s; prev_len = enclen(enc, p); p += prev_len; @@ -861,40 +1195,6 @@ compile_length_string_crude_node(StrNode* sn, regex_t* reg) } static int -compile_ambig_string_node(Node* node, regex_t* reg) -{ - int r; - int len; - int byte_len; - UChar* p; - StrNode* sn; - OnigEncoding enc = reg->enc; - - sn = STR_(node); - len = enclen(enc, sn->s); - byte_len = (int )(sn->end - sn->s); - if (len == byte_len) { - r = add_op(reg, OP_STR_1_IC); - if (r != 0) return r; - - xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s)); - xmemcpy(COP(reg)->exact.s, sn->s, (size_t )byte_len); - } - else { - r = add_op(reg, OP_STR_N_IC); - if (r != 0) return r; - - p = onigenc_strdup(enc, sn->s, sn->end); - CHECK_NULL_RETURN_MEMERR(p); - - COP(reg)->exact_n.s = p; - COP(reg)->exact_n.n = byte_len; - } - - return 0; -} - -static int compile_string_node(Node* node, regex_t* reg) { int r, len, prev_len, slen; @@ -907,9 +1207,6 @@ compile_string_node(Node* node, regex_t* reg) return 0; end = sn->end; - if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) { - return compile_ambig_string_node(node, reg); - } p = prev = sn->s; prev_len = enclen(enc, p); @@ -1103,7 +1400,7 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) /* anychar repeat */ if (is_anychar_infinite_greedy(qn)) { if (qn->lower <= 1 || - int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) { + len_multiply_cmp((OnigLen )tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) { if (IS_NOT_NULL(qn->next_head_exact)) return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else @@ -1117,7 +1414,7 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) if (infinite && (qn->lower <= 1 || - int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { + len_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { len = OPSIZE_JUMP; } @@ -1148,7 +1445,7 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) } else if (!infinite && qn->greedy && (qn->upper == 1 || - int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper, + len_multiply_cmp((OnigLen )tlen + OPSIZE_PUSH, qn->upper, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { len = tlen * qn->lower; len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower); @@ -1176,12 +1473,12 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (is_anychar_infinite_greedy(qn) && (qn->lower <= 1 || - int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { + len_multiply_cmp((OnigLen )tlen, qn->lower, + QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env); if (r != 0) return r; if (IS_NOT_NULL(qn->next_head_exact)) { - r = add_op(reg, - IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), reg)) ? + r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ? OP_ANYCHAR_ML_STAR_PEEK_NEXT : OP_ANYCHAR_STAR_PEEK_NEXT); if (r != 0) return r; @@ -1189,8 +1486,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) return 0; } else { - r = add_op(reg, - IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), reg)) ? + r = add_op(reg, NODE_IS_MULTILINE(NODE_QUANT_BODY(qn)) ? OP_ANYCHAR_ML_STAR : OP_ANYCHAR_STAR); return r; } @@ -1202,7 +1498,8 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (infinite && (qn->lower <= 1 || - int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { + len_multiply_cmp((OnigLen )tlen, qn->lower, + QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { int addr; if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { @@ -1297,7 +1594,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } else if (! infinite && qn->greedy && (qn->upper == 1 || - int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper, + len_multiply_cmp((OnigLen )tlen + OPSIZE_PUSH, qn->upper, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { int n = qn->upper - qn->lower; @@ -1337,11 +1634,8 @@ static int compile_length_option_node(BagNode* node, regex_t* reg) { int tlen; - OnigOptionType prev = reg->options; - reg->options = node->o.options; tlen = compile_length_tree(NODE_BAG_BODY(node), reg); - reg->options = prev; return tlen; } @@ -1350,11 +1644,8 @@ static int compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env) { int r; - OnigOptionType prev = reg->options; - reg->options = node->o.options; r = compile_tree(NODE_BAG_BODY(node), reg, env); - reg->options = prev; return r; } @@ -1423,10 +1714,10 @@ compile_length_bag_node(BagNode* node, regex_t* reg) v = onig_positive_int_multiply(qn->lower, tlen); if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; - len = v + OPSIZE_PUSH + tlen + OPSIZE_POP_OUT + OPSIZE_JUMP; + len = v + OPSIZE_PUSH + tlen + OPSIZE_POP + OPSIZE_JUMP; } else { - len = OPSIZE_ATOMIC_START + tlen + OPSIZE_ATOMIC_END; + len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK; } break; @@ -1438,8 +1729,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len = compile_length_tree(cond, reg); if (len < 0) return len; - len += OPSIZE_PUSH; - len += OPSIZE_ATOMIC_START + OPSIZE_ATOMIC_END; + len += OPSIZE_PUSH + OPSIZE_MARK + OPSIZE_CUT_TO_MARK; if (IS_NOT_NULL(Then)) { tlen = compile_length_tree(Then, reg); @@ -1447,7 +1737,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len += tlen; } - len += OPSIZE_JUMP + OPSIZE_ATOMIC_END; + len += OPSIZE_JUMP + OPSIZE_CUT_TO_MARK; if (IS_NOT_NULL(Else)) { tlen = compile_length_tree(Else, reg); @@ -1466,8 +1756,6 @@ compile_length_bag_node(BagNode* node, regex_t* reg) return len; } -static int get_char_len_node(Node* node, regex_t* reg, int* len); - static int compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) { @@ -1481,7 +1769,7 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) if (r != 0) return r; node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP; - NODE_STATUS_ADD(node, ADDR_FIXED); + NODE_STATUS_ADD(node, FIXED_ADDR); COP(reg)->call.addr = (int )node->m.called_addr; if (node->m.regnum == 0) { @@ -1574,35 +1862,49 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP_OUT + OPSIZE_JUMP; + COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP + OPSIZE_JUMP; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); if (r != 0) return r; - r = add_op(reg, OP_POP_OUT); + r = add_op(reg, OP_POP); if (r != 0) return r; r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP_OUT); + COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP); } else { - r = add_op(reg, OP_ATOMIC_START); + MemNumType mid; + + ID_ENTRY(env, mid); + r = add_op(reg, OP_MARK); if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = 0; + r = compile_tree(NODE_BAG_BODY(node), reg, env); if (r != 0) return r; - r = add_op(reg, OP_ATOMIC_END); + r = add_op(reg, OP_CUT_TO_MARK); + if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid; + COP(reg)->cut_to_mark.restore_pos = 0; } break; case BAG_IF_ELSE: { int cond_len, then_len, else_len, jump_len; + MemNumType mid; Node* cond = NODE_BAG_BODY(node); Node* Then = node->te.Then; Node* Else = node->te.Else; - r = add_op(reg, OP_ATOMIC_START); + ID_ENTRY(env, mid); + + r = add_op(reg, OP_MARK); if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = 0; cond_len = compile_length_tree(cond, reg); if (cond_len < 0) return cond_len; @@ -1613,7 +1915,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) else then_len = 0; - jump_len = cond_len + then_len + OPSIZE_ATOMIC_END + OPSIZE_JUMP; + jump_len = cond_len + then_len + OPSIZE_CUT_TO_MARK + OPSIZE_JUMP; r = add_op(reg, OP_PUSH); if (r != 0) return r; @@ -1621,8 +1923,10 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = compile_tree(cond, reg, env); if (r != 0) return r; - r = add_op(reg, OP_ATOMIC_END); + r = add_op(reg, OP_CUT_TO_MARK); if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid; + COP(reg)->cut_to_mark.restore_pos = 0; if (IS_NOT_NULL(Then)) { r = compile_tree(Then, reg, env); @@ -1638,10 +1942,12 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = OPSIZE_ATOMIC_END + else_len + SIZE_INC; + COP(reg)->jump.addr = OPSIZE_CUT_TO_MARK + else_len + SIZE_INC; - r = add_op(reg, OP_ATOMIC_END); + r = add_op(reg, OP_CUT_TO_MARK); if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid; + COP(reg)->cut_to_mark.restore_pos = 0; if (IS_NOT_NULL(Else)) { r = compile_tree(Else, reg, env); @@ -1666,16 +1972,38 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) switch (node->type) { case ANCR_PREC_READ: - len = OPSIZE_PREC_READ_START + tlen + OPSIZE_PREC_READ_END; + len = OPSIZE_MARK + tlen + OPSIZE_CUT_TO_MARK; break; case ANCR_PREC_READ_NOT: - len = OPSIZE_PREC_READ_NOT_START + tlen + OPSIZE_PREC_READ_NOT_END; + len = OPSIZE_PUSH + OPSIZE_MARK + tlen + OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL; break; case ANCR_LOOK_BEHIND: - len = OPSIZE_LOOK_BEHIND + tlen; + if (node->char_min_len == node->char_max_len) + len = OPSIZE_MARK + OPSIZE_STEP_BACK_START + tlen + OPSIZE_CUT_TO_MARK; + else { + len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_UPDATE_VAR + OPSIZE_FAIL + OPSIZE_JUMP + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_CUT_TO_MARK + OPSIZE_UPDATE_VAR; + + if (IS_NOT_NULL(node->lead_node)) { + int llen = compile_length_tree(node->lead_node, reg); + if (llen < 0) return llen; + + len += OPSIZE_MOVE + llen; + } + } break; case ANCR_LOOK_BEHIND_NOT: - len = OPSIZE_LOOK_BEHIND_NOT_START + tlen + OPSIZE_LOOK_BEHIND_NOT_END; + if (node->char_min_len == node->char_max_len) + len = OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + tlen + OPSIZE_POP_TO_MARK + OPSIZE_FAIL + OPSIZE_POP; + else { + len = OPSIZE_SAVE_VAL + OPSIZE_UPDATE_VAR + OPSIZE_MARK + OPSIZE_PUSH + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + tlen + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_POP; + + if (IS_NOT_NULL(node->lead_node)) { + int llen = compile_length_tree(node->lead_node, reg); + if (llen < 0) return llen; + + len += OPSIZE_MOVE + llen; + } + } break; case ANCR_WORD_BOUNDARY: @@ -1701,10 +2029,254 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) } static int +compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env) +{ + int r; + + if (node->char_min_len == node->char_max_len) { + MemNumType mid; + + ID_ENTRY(env, mid); + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = FALSE; + + r = add_op(reg, OP_STEP_BACK_START); + if (r != 0) return r; + COP(reg)->step_back_start.initial = node->char_min_len; + COP(reg)->step_back_start.remaining = 0; + COP(reg)->step_back_start.addr = 1; + + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_CUT_TO_MARK); + if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid; + COP(reg)->cut_to_mark.restore_pos = FALSE; + } + else { + MemNumType mid1, mid2; + OnigLen diff; + + if (IS_NOT_NULL(node->lead_node)) { + MinMaxCharLen ci; + + r = node_char_len(node->lead_node, reg, &ci, env); + if (r < 0) return r; + r = add_op(reg, OP_MOVE); + if (r != 0) return r; + COP(reg)->move.n = -((RelPositionType )ci.min); + r = compile_tree(node->lead_node, reg, env); + if (r != 0) return r; + } + + ID_ENTRY(env, mid1); + r = add_op(reg, OP_SAVE_VAL); + if (r != 0) return r; + COP(reg)->save_val.type = SAVE_RIGHT_RANGE; + COP(reg)->save_val.id = mid1; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S; + + ID_ENTRY(env, mid2); + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid2; + COP(reg)->mark.save_pos = FALSE; + + r = add_op(reg, OP_PUSH); + if (r != 0) return r; + COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP; + + r = add_op(reg, OP_JUMP); + if (r != 0) return r; + COP(reg)->jump.addr = SIZE_INC + OPSIZE_UPDATE_VAR + OPSIZE_FAIL; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK; + COP(reg)->update_var.id = mid1; + COP(reg)->update_var.clear = FALSE; + r = add_op(reg, OP_FAIL); + if (r != 0) return r; + + r = add_op(reg, OP_STEP_BACK_START); + if (r != 0) return r; + + if (node->char_max_len != INFINITE_LEN) + diff = node->char_max_len - node->char_min_len; + else + diff = INFINITE_LEN; + + COP(reg)->step_back_start.initial = node->char_min_len; + COP(reg)->step_back_start.remaining = diff; + COP(reg)->step_back_start.addr = 2; + + r = add_op(reg, OP_STEP_BACK_NEXT); + if (r != 0) return r; + + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_CHECK_POSITION); + if (r != 0) return r; + COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE; + + r = add_op(reg, OP_CUT_TO_MARK); + if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid2; + COP(reg)->cut_to_mark.restore_pos = FALSE; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK; + COP(reg)->update_var.id = mid1; + COP(reg)->update_var.clear = TRUE; + } + + return r; +} + +static int +compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg, + ScanEnv* env) +{ + int r; + int len; + + len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); + + if (node->char_min_len == node->char_max_len) { + MemNumType mid; + + ID_ENTRY(env, mid); + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = FALSE; + + r = add_op(reg, OP_PUSH); + if (r != 0) return r; + COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + len + OPSIZE_POP_TO_MARK + OPSIZE_FAIL; + + r = add_op(reg, OP_STEP_BACK_START); + if (r != 0) return r; + COP(reg)->step_back_start.initial = node->char_min_len; + COP(reg)->step_back_start.remaining = 0; + COP(reg)->step_back_start.addr = 1; + + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_POP_TO_MARK); + if (r != 0) return r; + COP(reg)->pop_to_mark.id = mid; + r = add_op(reg, OP_FAIL); + if (r != 0) return r; + r = add_op(reg, OP_POP); + } + else { + MemNumType mid1, mid2; + OnigLen diff; + + ID_ENTRY(env, mid1); + r = add_op(reg, OP_SAVE_VAL); + if (r != 0) return r; + COP(reg)->save_val.type = SAVE_RIGHT_RANGE; + COP(reg)->save_val.id = mid1; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_TO_S; + + ID_ENTRY(env, mid2); + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid2; + COP(reg)->mark.save_pos = FALSE; + + r = add_op(reg, OP_PUSH); + if (r != 0) return r; + COP(reg)->push.addr = SIZE_INC + OPSIZE_STEP_BACK_START + OPSIZE_STEP_BACK_NEXT + len + OPSIZE_CHECK_POSITION + OPSIZE_POP_TO_MARK + OPSIZE_UPDATE_VAR + OPSIZE_POP + OPSIZE_FAIL; + + if (IS_NOT_NULL(node->lead_node)) { + int clen; + MinMaxCharLen ci; + + clen = compile_length_tree(node->lead_node, reg); + COP(reg)->push.addr += OPSIZE_MOVE + clen; + + r = node_char_len(node->lead_node, reg, &ci, env); + if (r < 0) return r; + r = add_op(reg, OP_MOVE); + if (r != 0) return r; + COP(reg)->move.n = -((RelPositionType )ci.min); + + r = compile_tree(node->lead_node, reg, env); + if (r != 0) return r; + } + + r = add_op(reg, OP_STEP_BACK_START); + if (r != 0) return r; + + if (node->char_max_len != INFINITE_LEN) + diff = node->char_max_len - node->char_min_len; + else + diff = INFINITE_LEN; + + COP(reg)->step_back_start.initial = node->char_min_len; + COP(reg)->step_back_start.remaining = diff; + COP(reg)->step_back_start.addr = 2; + + r = add_op(reg, OP_STEP_BACK_NEXT); + if (r != 0) return r; + + r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_CHECK_POSITION); + if (r != 0) return r; + COP(reg)->check_position.type = CHECK_POSITION_CURRENT_RIGHT_RANGE; + + r = add_op(reg, OP_POP_TO_MARK); + if (r != 0) return r; + COP(reg)->pop_to_mark.id = mid2; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK; + COP(reg)->update_var.id = mid1; + COP(reg)->update_var.clear = FALSE; + + r = add_op(reg, OP_POP); /* pop save val */ + if (r != 0) return r; + r = add_op(reg, OP_FAIL); + if (r != 0) return r; + + r = add_op(reg, OP_UPDATE_VAR); + if (r != 0) return r; + COP(reg)->update_var.type = UPDATE_VAR_RIGHT_RANGE_FROM_STACK; + COP(reg)->update_var.id = mid1; + COP(reg)->update_var.clear = FALSE; + + r = add_op(reg, OP_POP); /* pop mark */ + if (r != 0) return r; + r = add_op(reg, OP_POP); /* pop save val */ + } + + return r; +} + +static int compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) { int r, len; enum OpCode op; + MemNumType mid; switch (node->type) { case ANCR_BEGIN_BUF: r = add_op(reg, OP_BEGIN_BUF); break; @@ -1712,7 +2284,11 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) case ANCR_BEGIN_LINE: r = add_op(reg, OP_BEGIN_LINE); break; case ANCR_END_LINE: r = add_op(reg, OP_END_LINE); break; case ANCR_SEMI_END_BUF: r = add_op(reg, OP_SEMI_END_BUF); break; - case ANCR_BEGIN_POSITION: r = add_op(reg, OP_BEGIN_POSITION); break; + case ANCR_BEGIN_POSITION: + r = add_op(reg, OP_CHECK_POSITION); + if (r != 0) return r; + COP(reg)->check_position.type = CHECK_POSITION_SEARCH_START; + break; case ANCR_WORD_BOUNDARY: op = OP_WORD_BOUNDARY; @@ -1744,7 +2320,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY; #ifdef USE_UNICODE_WORD_BREAK - if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_TEXT_SEGMENT_WORD)) + if (NODE_IS_TEXT_SEGMENT_WORD(node)) type = WORD_BOUNDARY; #endif @@ -1755,66 +2331,60 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) break; case ANCR_PREC_READ: - r = add_op(reg, OP_PREC_READ_START); - if (r != 0) return r; - r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); - if (r != 0) return r; - r = add_op(reg, OP_PREC_READ_END); - break; - - case ANCR_PREC_READ_NOT: - len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); - if (len < 0) return len; - - r = add_op(reg, OP_PREC_READ_NOT_START); - if (r != 0) return r; - COP(reg)->prec_read_not_start.addr = SIZE_INC + len + OPSIZE_PREC_READ_NOT_END; - r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); - if (r != 0) return r; - r = add_op(reg, OP_PREC_READ_NOT_END); - break; - - case ANCR_LOOK_BEHIND: { - int n; - r = add_op(reg, OP_LOOK_BEHIND); + ID_ENTRY(env, mid); + r = add_op(reg, OP_MARK); if (r != 0) return r; - if (node->char_len < 0) { - r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n); - if (r != 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - } - else - n = node->char_len; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = TRUE; - COP(reg)->look_behind.len = n; r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); + if (r != 0) return r; + + r = add_op(reg, OP_CUT_TO_MARK); + if (r != 0) return r; + COP(reg)->cut_to_mark.id = mid; + COP(reg)->cut_to_mark.restore_pos = TRUE; } break; - case ANCR_LOOK_BEHIND_NOT: + case ANCR_PREC_READ_NOT: { - int n; - len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); - r = add_op(reg, OP_LOOK_BEHIND_NOT_START); - if (r != 0) return r; - COP(reg)->look_behind_not_start.addr = SIZE_INC + len + OPSIZE_LOOK_BEHIND_NOT_END; + if (len < 0) return len; - if (node->char_len < 0) { - r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n); - if (r != 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - } - else - n = node->char_len; + ID_ENTRY(env, mid); + r = add_op(reg, OP_PUSH); + if (r != 0) return r; + COP(reg)->push.addr = SIZE_INC + OPSIZE_MARK + len + + OPSIZE_POP_TO_MARK + OPSIZE_POP + OPSIZE_FAIL; - COP(reg)->look_behind_not_start.len = n; + r = add_op(reg, OP_MARK); + if (r != 0) return r; + COP(reg)->mark.id = mid; + COP(reg)->mark.save_pos = FALSE; r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); if (r != 0) return r; - r = add_op(reg, OP_LOOK_BEHIND_NOT_END); + + r = add_op(reg, OP_POP_TO_MARK); + if (r != 0) return r; + COP(reg)->pop_to_mark.id = mid; + + r = add_op(reg, OP_POP); + if (r != 0) return r; + r = add_op(reg, OP_FAIL); } break; + case ANCR_LOOK_BEHIND: + r = compile_anchor_look_behind_node(node, reg, env); + break; + + case ANCR_LOOK_BEHIND_NOT: + r = compile_anchor_look_behind_not_node(node, reg, env); + break; + default: return ONIGERR_TYPE_BUG; break; @@ -1826,7 +2396,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) static int compile_gimmick_node(GimmickNode* node, regex_t* reg) { - int r; + int r = 0; switch (node->type) { case GIMMICK_FAIL: @@ -1834,10 +2404,10 @@ compile_gimmick_node(GimmickNode* node, regex_t* reg) break; case GIMMICK_SAVE: - r = add_op(reg, OP_PUSH_SAVE_VAL); + r = add_op(reg, OP_SAVE_VAL); if (r != 0) return r; - COP(reg)->push_save_val.type = node->detail_type; - COP(reg)->push_save_val.id = node->id; + COP(reg)->save_val.type = node->detail_type; + COP(reg)->save_val.id = node->id; break; case GIMMICK_UPDATE_VAR: @@ -1845,6 +2415,7 @@ compile_gimmick_node(GimmickNode* node, regex_t* reg) if (r != 0) return r; COP(reg)->update_var.type = node->detail_type; COP(reg)->update_var.id = node->id; + COP(reg)->update_var.clear = FALSE; break; #ifdef USE_CALLOUT @@ -1888,7 +2459,7 @@ compile_length_gimmick_node(GimmickNode* node, regex_t* reg) break; case GIMMICK_SAVE: - len = OPSIZE_PUSH_SAVE_VAL; + len = OPSIZE_SAVE_VAL; break; case GIMMICK_UPDATE_VAR: @@ -2055,8 +2626,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) switch (CTYPE_(node)->ctype) { case CTYPE_ANYCHAR: - r = add_op(reg, IS_MULTILINE(CTYPE_OPTION(node, reg)) ? - OP_ANYCHAR_ML : OP_ANYCHAR); + r = add_op(reg, NODE_IS_MULTILINE(node) ? OP_ANYCHAR_ML : OP_ANYCHAR); break; case ONIGENC_CTYPE_WORD: @@ -2098,7 +2668,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) else { #ifdef USE_BACKREF_WITH_LEVEL if (NODE_IS_NEST_LEVEL(node)) { - if ((reg->options & ONIG_OPTION_IGNORECASE) != 0) + if (NODE_IS_IGNORECASE(node)) r = add_op(reg, OP_BACKREF_WITH_LEVEL_IC); else r = add_op(reg, OP_BACKREF_WITH_LEVEL); @@ -2111,7 +2681,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) #endif if (br->back_num == 1) { n = br->back_static[0]; - if (IS_IGNORECASE(reg->options)) { + if (NODE_IS_IGNORECASE(node)) { r = add_op(reg, OP_BACKREF_N_IC); if (r != 0) return r; COP(reg)->backref_n.n1 = n; @@ -2132,7 +2702,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) int num; int* p; - r = add_op(reg, IS_IGNORECASE(reg->options) ? + r = add_op(reg, NODE_IS_IGNORECASE(node) ? OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI); if (r != 0) return r; @@ -2183,7 +2753,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) default: #ifdef ONIG_DEBUG - fprintf(stderr, "compile_tree: undefined node type %d\n", NODE_TYPE(node)); + fprintf(DBGFP, "compile_tree: undefined node type %d\n", NODE_TYPE(node)); #endif break; } @@ -2192,7 +2762,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) } static int -noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) +make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter) { int r = 0; Node* node = *plink; @@ -2201,7 +2771,7 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) case NODE_LIST: case NODE_ALT: do { - r = noname_disable_map(&(NODE_CAR(node)), map, counter); + r = make_named_capture_number_map(&(NODE_CAR(node)), map, counter); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -2209,7 +2779,7 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) { Node** ptarget = &(NODE_BODY(node)); Node* old = *ptarget; - r = noname_disable_map(ptarget, map, counter); + r = make_named_capture_number_map(ptarget, map, counter); if (r != 0) return r; if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) { r = onig_reduce_nested_quantifier(node); @@ -2225,35 +2795,35 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) (*counter)++; map[en->m.regnum].new_val = *counter; en->m.regnum = *counter; - r = noname_disable_map(&(NODE_BODY(node)), map, counter); + r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); } else { *plink = NODE_BODY(node); NODE_BODY(node) = NULL_NODE; onig_node_free(node); - r = noname_disable_map(plink, map, counter); + r = make_named_capture_number_map(plink, map, counter); } } else if (en->type == BAG_IF_ELSE) { - r = noname_disable_map(&(NODE_BAG_BODY(en)), map, counter); + r = make_named_capture_number_map(&(NODE_BAG_BODY(en)), map, counter); if (r != 0) return r; if (IS_NOT_NULL(en->te.Then)) { - r = noname_disable_map(&(en->te.Then), map, counter); + r = make_named_capture_number_map(&(en->te.Then), map, counter); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) { - r = noname_disable_map(&(en->te.Else), map, counter); + r = make_named_capture_number_map(&(en->te.Else), map, counter); if (r != 0) return r; } } else - r = noname_disable_map(&(NODE_BODY(node)), map, counter); + r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); } break; case NODE_ANCHOR: if (IS_NOT_NULL(NODE_BODY(node))) - r = noname_disable_map(&(NODE_BODY(node)), map, counter); + r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); break; default: @@ -2264,7 +2834,7 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) } static int -renumber_node_backref(Node* node, GroupNumRemap* map) +renumber_backref_node(Node* node, GroupNumMap* map) { int i, pos, n, old_num; int *backs; @@ -2292,7 +2862,7 @@ renumber_node_backref(Node* node, GroupNumRemap* map) } static int -renumber_by_map(Node* node, GroupNumRemap* map) +renumber_backref_traverse(Node* node, GroupNumMap* map) { int r = 0; @@ -2300,28 +2870,28 @@ renumber_by_map(Node* node, GroupNumRemap* map) case NODE_LIST: case NODE_ALT: do { - r = renumber_by_map(NODE_CAR(node), map); + r = renumber_backref_traverse(NODE_CAR(node), map); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: - r = renumber_by_map(NODE_BODY(node), map); + r = renumber_backref_traverse(NODE_BODY(node), map); break; case NODE_BAG: { BagNode* en = BAG_(node); - r = renumber_by_map(NODE_BODY(node), map); + r = renumber_backref_traverse(NODE_BODY(node), map); if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = renumber_by_map(en->te.Then, map); + r = renumber_backref_traverse(en->te.Then, map); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) { - r = renumber_by_map(en->te.Else, map); + r = renumber_backref_traverse(en->te.Else, map); if (r != 0) return r; } } @@ -2329,12 +2899,12 @@ renumber_by_map(Node* node, GroupNumRemap* map) break; case NODE_BACKREF: - r = renumber_node_backref(node, map); + r = renumber_backref_node(node, map); break; case NODE_ANCHOR: if (IS_NOT_NULL(NODE_BODY(node))) - r = renumber_by_map(NODE_BODY(node), map); + r = renumber_backref_traverse(NODE_BODY(node), map); break; default: @@ -2403,18 +2973,18 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) { int r, i, pos, counter; MemStatusType loc; - GroupNumRemap* map; + GroupNumMap* map; - map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1)); + map = (GroupNumMap* )xalloca(sizeof(GroupNumMap) * (env->num_mem + 1)); CHECK_NULL_RETURN_MEMERR(map); for (i = 1; i <= env->num_mem; i++) { map[i].new_val = 0; } counter = 0; - r = noname_disable_map(root, map, &counter); + r = make_named_capture_number_map(root, map, &counter); if (r != 0) return r; - r = renumber_by_map(*root, map); + r = renumber_backref_traverse(*root, map); if (r != 0) return r; for (i = 1, pos = 1; i <= env->num_mem; i++) { @@ -2448,8 +3018,18 @@ fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg) AbsAddrType* paddr; for (i = 0; i < uslist->num; i++) { - if (! NODE_IS_ADDR_FIXED(uslist->us[i].target)) - return ONIGERR_PARSER_BUG; + if (! NODE_IS_FIXED_ADDR(uslist->us[i].target)) { + if (NODE_IS_CALLED(uslist->us[i].target)) + return ONIGERR_PARSER_BUG; + else { + /* CASE: called node doesn't have called address. + ex. /((|a\g<1>)(.){0}){0}\g<3>/ + group-1 doesn't called, but compiled into bytecodes, + because group-3 is referred from outside. + */ + continue; + } + } en = BAG_(uslist->us[i].target); addr = en->m.called_addr; @@ -2462,173 +3042,6 @@ fix_unset_addr_list(UnsetAddrList* uslist, regex_t* reg) } #endif - -#define GET_CHAR_LEN_VARLEN -1 -#define GET_CHAR_LEN_TOP_ALT_VARLEN -2 - -/* fixed size pattern node only */ -static int -get_char_len_node1(Node* node, regex_t* reg, int* len, int level) -{ - int tlen; - int r = 0; - - level++; - *len = 0; - switch (NODE_TYPE(node)) { - case NODE_LIST: - do { - r = get_char_len_node1(NODE_CAR(node), reg, &tlen, level); - if (r == 0) - *len = distance_add(*len, tlen); - } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); - break; - - case NODE_ALT: - { - int tlen2; - int varlen = 0; - - r = get_char_len_node1(NODE_CAR(node), reg, &tlen, level); - while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))) { - r = get_char_len_node1(NODE_CAR(node), reg, &tlen2, level); - if (r == 0) { - if (tlen != tlen2) - varlen = 1; - } - } - if (r == 0) { - if (varlen != 0) { - if (level == 1) - r = GET_CHAR_LEN_TOP_ALT_VARLEN; - else - r = GET_CHAR_LEN_VARLEN; - } - else - *len = tlen; - } - } - break; - - case NODE_STRING: - { - StrNode* sn = STR_(node); - UChar *s = sn->s; - - while (s < sn->end) { - s += enclen(reg->enc, s); - (*len)++; - } - } - break; - - case NODE_QUANT: - { - QuantNode* qn = QUANT_(node); - - if (qn->lower == qn->upper) { - if (qn->upper == 0) { - *len = 0; - } - else { - r = get_char_len_node1(NODE_BODY(node), reg, &tlen, level); - if (r == 0) - *len = distance_multiply(tlen, qn->lower); - } - } - else - r = GET_CHAR_LEN_VARLEN; - } - break; - -#ifdef USE_CALL - case NODE_CALL: - if (! NODE_IS_RECURSION(node)) - r = get_char_len_node1(NODE_BODY(node), reg, len, level); - else - r = GET_CHAR_LEN_VARLEN; - break; -#endif - - case NODE_CTYPE: - case NODE_CCLASS: - *len = 1; - break; - - case NODE_BAG: - { - BagNode* en = BAG_(node); - - switch (en->type) { - case BAG_MEMORY: -#ifdef USE_CALL - if (NODE_IS_CLEN_FIXED(node)) - *len = en->char_len; - else { - r = get_char_len_node1(NODE_BODY(node), reg, len, level); - if (r == 0) { - en->char_len = *len; - NODE_STATUS_ADD(node, CLEN_FIXED); - } - } - break; -#endif - case BAG_OPTION: - case BAG_STOP_BACKTRACK: - r = get_char_len_node1(NODE_BODY(node), reg, len, level); - break; - case BAG_IF_ELSE: - { - int clen, elen; - - r = get_char_len_node1(NODE_BODY(node), reg, &clen, level); - if (r == 0) { - if (IS_NOT_NULL(en->te.Then)) { - r = get_char_len_node1(en->te.Then, reg, &tlen, level); - if (r != 0) break; - } - else tlen = 0; - if (IS_NOT_NULL(en->te.Else)) { - r = get_char_len_node1(en->te.Else, reg, &elen, level); - if (r != 0) break; - } - else elen = 0; - - if (clen + tlen != elen) { - r = GET_CHAR_LEN_VARLEN; - } - else { - *len = elen; - } - } - } - break; - } - } - break; - - case NODE_ANCHOR: - case NODE_GIMMICK: - break; - - case NODE_BACKREF: - if (NODE_IS_CHECKER(node)) - break; - /* fall */ - default: - r = GET_CHAR_LEN_VARLEN; - break; - } - - return r; -} - -static int -get_char_len_node(Node* node, regex_t* reg, int* len) -{ - return get_char_len_node1(node, reg, len, 0); -} - /* x is not included y ==> 1 : 0 */ static int is_exclusive(Node* x, Node* y, regex_t* reg) @@ -2804,14 +3217,9 @@ is_exclusive(Node* x, Node* y, regex_t* reg) len = NODE_STRING_LEN(x); if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y); - if (NODE_STRING_IS_CASE_FOLD_MATCH(x) || NODE_STRING_IS_CASE_FOLD_MATCH(y)) { - /* tiny version */ - return 0; - } - else { - for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { - if (*p != *q) return 1; - } + + for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { + if (*p != *q) return 1; } } break; @@ -2830,7 +3238,7 @@ is_exclusive(Node* x, Node* y, regex_t* reg) } static Node* -get_head_value_node(Node* node, int exact, regex_t* reg) +get_tree_head_literal(Node* node, int exact, regex_t* reg) { Node* n = NULL_NODE; @@ -2853,7 +3261,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) break; case NODE_LIST: - n = get_head_value_node(NODE_CAR(node), exact, reg); + n = get_tree_head_literal(NODE_CAR(node), exact, reg); break; case NODE_STRING: @@ -2864,7 +3272,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) break; if (exact == 0 || - ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_CRUDE(node)) { + ! NODE_IS_IGNORECASE(node) || NODE_STRING_IS_CRUDE(node)) { n = node; } } @@ -2877,7 +3285,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) if (IS_NOT_NULL(qn->head_exact)) n = qn->head_exact; else - n = get_head_value_node(NODE_BODY(node), exact, reg); + n = get_tree_head_literal(NODE_BODY(node), exact, reg); } } break; @@ -2887,19 +3295,10 @@ get_head_value_node(Node* node, int exact, regex_t* reg) BagNode* en = BAG_(node); switch (en->type) { case BAG_OPTION: - { - OnigOptionType options = reg->options; - - reg->options = BAG_(node)->o.options; - n = get_head_value_node(NODE_BODY(node), exact, reg); - reg->options = options; - } - break; - case BAG_MEMORY: case BAG_STOP_BACKTRACK: case BAG_IF_ELSE: - n = get_head_value_node(NODE_BODY(node), exact, reg); + n = get_tree_head_literal(NODE_BODY(node), exact, reg); break; } } @@ -2907,7 +3306,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) case NODE_ANCHOR: if (ANCHOR_(node)->type == ANCR_PREC_READ) - n = get_head_value_node(NODE_BODY(node), exact, reg); + n = get_tree_head_literal(NODE_BODY(node), exact, reg); break; case NODE_GIMMICK: @@ -2918,42 +3317,244 @@ get_head_value_node(Node* node, int exact, regex_t* reg) return n; } +enum GetValue { + GET_VALUE_NONE = -1, + GET_VALUE_IGNORE = 0, + GET_VALUE_FOUND = 1 +}; + +static int +get_tree_tail_literal(Node* node, Node** rnode, regex_t* reg) +{ + int r; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + if (IS_NULL(NODE_CDR(node))) { + r = get_tree_tail_literal(NODE_CAR(node), rnode, reg); + } + else { + r = get_tree_tail_literal(NODE_CDR(node), rnode, reg); + if (r == GET_VALUE_IGNORE) { + r = get_tree_tail_literal(NODE_CAR(node), rnode, reg); + } + } + break; + +#ifdef USE_CALL + case NODE_CALL: + r = get_tree_tail_literal(NODE_BODY(node), rnode, reg); + break; +#endif + + case NODE_CTYPE: + if (CTYPE_(node)->ctype == CTYPE_ANYCHAR) { + r = GET_VALUE_NONE; + break; + } + /* fall */ + case NODE_CCLASS: + *rnode = node; + r = GET_VALUE_FOUND; + break; + + case NODE_STRING: + { + StrNode* sn = STR_(node); + + if (sn->end <= sn->s) { + r = GET_VALUE_IGNORE; + break; + } + + if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { + r = GET_VALUE_NONE; + break; + } + + *rnode = node; + r = GET_VALUE_FOUND; + } + break; + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + if (qn->lower != 0) { + r = get_tree_tail_literal(NODE_BODY(node), rnode, reg); + } + else + r = GET_VALUE_NONE; + } + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + if (en->type == BAG_MEMORY) { + if (NODE_IS_MARK1(node)) + r = GET_VALUE_NONE; + else { + NODE_STATUS_ADD(node, MARK1); + r = get_tree_tail_literal(NODE_BODY(node), rnode, reg); + NODE_STATUS_REMOVE(node, MARK1); + } + } + else { + r = get_tree_tail_literal(NODE_BODY(node), rnode, reg); + } + } + break; + + case NODE_ANCHOR: + case NODE_GIMMICK: + r = GET_VALUE_IGNORE; + break; + + case NODE_ALT: + case NODE_BACKREF: + default: + r = GET_VALUE_NONE; + break; + } + + return r; +} + static int -check_type_tree(Node* node, int type_mask, int bag_mask, int anchor_mask) +check_called_node_in_look_behind(Node* node, int not) { + int r; + + r = 0; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + r = check_called_node_in_look_behind(NODE_CAR(node), not); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_QUANT: + r = check_called_node_in_look_behind(NODE_BODY(node), not); + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + if (en->type == BAG_MEMORY) { + if (NODE_IS_MARK1(node)) + return 0; + else { + NODE_STATUS_ADD(node, MARK1); + r = check_called_node_in_look_behind(NODE_BODY(node), not); + NODE_STATUS_REMOVE(node, MARK1); + } + } + else { + r = check_called_node_in_look_behind(NODE_BODY(node), not); + if (r == 0 && en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = check_called_node_in_look_behind(en->te.Then, not); + if (r != 0) break; + } + if (IS_NOT_NULL(en->te.Else)) { + r = check_called_node_in_look_behind(en->te.Else, not); + } + } + } + } + break; + + case NODE_ANCHOR: + if (IS_NOT_NULL(NODE_BODY(node))) + r = check_called_node_in_look_behind(NODE_BODY(node), not); + break; + + case NODE_GIMMICK: + if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0) + return 1; + break; + + default: + break; + } + + return r; +} + +/* allowed node types in look-behind */ +#define ALLOWED_TYPE_IN_LB \ + ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \ + | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \ + | NODE_BIT_CALL | NODE_BIT_BACKREF | NODE_BIT_GIMMICK) + +#define ALLOWED_BAG_IN_LB ( 1<<BAG_MEMORY | 1<<BAG_OPTION | 1<<BAG_STOP_BACKTRACK | 1<<BAG_IF_ELSE ) +#define ALLOWED_BAG_IN_LB_NOT ( 1<<BAG_OPTION | 1<<BAG_STOP_BACKTRACK | 1<<BAG_IF_ELSE ) + +#define ALLOWED_ANCHOR_IN_LB \ + ( ANCR_LOOK_BEHIND | ANCR_BEGIN_LINE | ANCR_END_LINE | ANCR_BEGIN_BUF \ + | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY | ANCR_NO_WORD_BOUNDARY \ + | ANCR_WORD_BEGIN | ANCR_WORD_END \ + | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY ) + +#define ALLOWED_ANCHOR_IN_LB_NOT \ + ( ANCR_LOOK_BEHIND | ANCR_LOOK_BEHIND_NOT | ANCR_BEGIN_LINE \ + | ANCR_END_LINE | ANCR_BEGIN_BUF | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY \ + | ANCR_NO_WORD_BOUNDARY | ANCR_WORD_BEGIN | ANCR_WORD_END \ + | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY ) + + +static int +check_node_in_look_behind(Node* node, int not, int* used) +{ + static unsigned int + bag_mask[2] = { ALLOWED_BAG_IN_LB, ALLOWED_BAG_IN_LB_NOT }; + + static unsigned int + anchor_mask[2] = { ALLOWED_ANCHOR_IN_LB, ALLOWED_ANCHOR_IN_LB_NOT }; + NodeType type; int r = 0; type = NODE_TYPE(node); - if ((NODE_TYPE2BIT(type) & type_mask) == 0) + if ((NODE_TYPE2BIT(type) & ALLOWED_TYPE_IN_LB) == 0) return 1; switch (type) { case NODE_LIST: case NODE_ALT: do { - r = check_type_tree(NODE_CAR(node), type_mask, bag_mask, anchor_mask); + r = check_node_in_look_behind(NODE_CAR(node), not, used); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: - r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask); + r = check_node_in_look_behind(NODE_BODY(node), not, used); break; case NODE_BAG: { BagNode* en = BAG_(node); - if (((1<<en->type) & bag_mask) == 0) + if (((1<<en->type) & bag_mask[not]) == 0) return 1; - r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask); - if (r == 0 && en->type == BAG_IF_ELSE) { + r = check_node_in_look_behind(NODE_BODY(node), not, used); + if (r != 0) break; + + if (en->type == BAG_MEMORY) { + if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node)) *used = TRUE; + } + else if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = check_type_tree(en->te.Then, type_mask, bag_mask, anchor_mask); + r = check_node_in_look_behind(en->te.Then, not, used); if (r != 0) break; } if (IS_NOT_NULL(en->te.Else)) { - r = check_type_tree(en->te.Else, type_mask, bag_mask, anchor_mask); + r = check_node_in_look_behind(en->te.Else, not, used); } } } @@ -2961,14 +3562,22 @@ check_type_tree(Node* node, int type_mask, int bag_mask, int anchor_mask) case NODE_ANCHOR: type = ANCHOR_(node)->type; - if ((type & anchor_mask) == 0) + if ((type & anchor_mask[not]) == 0) return 1; if (IS_NOT_NULL(NODE_BODY(node))) - r = check_type_tree(NODE_BODY(node), type_mask, bag_mask, anchor_mask); + r = check_node_in_look_behind(NODE_BODY(node), not, used); break; case NODE_GIMMICK: + if (NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) != 0) + return 1; + break; + + case NODE_CALL: + r = check_called_node_in_look_behind(NODE_BODY(node), not); + break; + default: break; } @@ -2976,7 +3585,7 @@ check_type_tree(Node* node, int type_mask, int bag_mask, int anchor_mask) } static OnigLen -tree_min_len(Node* node, ScanEnv* env) +node_min_byte_len(Node* node, ScanEnv* env) { OnigLen len; OnigLen tmin; @@ -2992,9 +3601,9 @@ tree_min_len(Node* node, ScanEnv* env) if (NODE_IS_RECURSION(node)) break; backs = BACKREFS_P(br); - len = tree_min_len(mem_env[backs[0]].mem_node, env); + len = node_min_byte_len(mem_env[backs[0]].mem_node, env); for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].mem_node, env); + tmin = node_min_byte_len(mem_env[backs[i]].mem_node, env); if (len > tmin) len = tmin; } } @@ -3005,18 +3614,18 @@ tree_min_len(Node* node, ScanEnv* env) { Node* t = NODE_BODY(node); if (NODE_IS_RECURSION(node)) { - if (NODE_IS_MIN_FIXED(t)) + if (NODE_IS_FIXED_MIN(t)) len = BAG_(t)->min_len; } else - len = tree_min_len(t, env); + len = node_min_byte_len(t, env); } break; #endif case NODE_LIST: do { - tmin = tree_min_len(NODE_CAR(node), env); + tmin = node_min_byte_len(NODE_CAR(node), env); len = distance_add(len, tmin); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -3027,7 +3636,7 @@ tree_min_len(Node* node, ScanEnv* env) y = node; do { x = NODE_CAR(y); - tmin = tree_min_len(x, env); + tmin = node_min_byte_len(x, env); if (y == node) len = tmin; else if (len > tmin) len = tmin; } while (IS_NOT_NULL(y = NODE_CDR(y))); @@ -3051,7 +3660,7 @@ tree_min_len(Node* node, ScanEnv* env) QuantNode* qn = QUANT_(node); if (qn->lower > 0) { - len = tree_min_len(NODE_BODY(node), env); + len = node_min_byte_len(NODE_BODY(node), env); len = distance_multiply(len, qn->lower); } } @@ -3062,35 +3671,35 @@ tree_min_len(Node* node, ScanEnv* env) BagNode* en = BAG_(node); switch (en->type) { case BAG_MEMORY: - if (NODE_IS_MIN_FIXED(node)) + if (NODE_IS_FIXED_MIN(node)) len = en->min_len; else { if (NODE_IS_MARK1(node)) len = 0; /* recursive */ else { NODE_STATUS_ADD(node, MARK1); - len = tree_min_len(NODE_BODY(node), env); + len = node_min_byte_len(NODE_BODY(node), env); NODE_STATUS_REMOVE(node, MARK1); en->min_len = len; - NODE_STATUS_ADD(node, MIN_FIXED); + NODE_STATUS_ADD(node, FIXED_MIN); } } break; case BAG_OPTION: case BAG_STOP_BACKTRACK: - len = tree_min_len(NODE_BODY(node), env); + len = node_min_byte_len(NODE_BODY(node), env); break; case BAG_IF_ELSE: { OnigLen elen; - len = tree_min_len(NODE_BODY(node), env); + len = node_min_byte_len(NODE_BODY(node), env); if (IS_NOT_NULL(en->te.Then)) - len += tree_min_len(en->te.Then, env); + len += node_min_byte_len(en->te.Then, env); if (IS_NOT_NULL(en->te.Else)) - elen = tree_min_len(en->te.Else, env); + elen = node_min_byte_len(en->te.Else, env); else elen = 0; if (elen < len) len = elen; @@ -3118,7 +3727,7 @@ tree_min_len(Node* node, ScanEnv* env) } static OnigLen -tree_max_len(Node* node, ScanEnv* env) +node_max_byte_len(Node* node, ScanEnv* env) { OnigLen len; OnigLen tmax; @@ -3127,14 +3736,14 @@ tree_max_len(Node* node, ScanEnv* env) switch (NODE_TYPE(node)) { case NODE_LIST: do { - tmax = tree_max_len(NODE_CAR(node), env); + tmax = node_max_byte_len(NODE_CAR(node), env); len = distance_add(len, tmax); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_ALT: do { - tmax = tree_max_len(NODE_CAR(node), env); + tmax = node_max_byte_len(NODE_CAR(node), env); if (len < tmax) len = tmax; } while (IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -3158,12 +3767,16 @@ tree_max_len(Node* node, ScanEnv* env) MemEnv* mem_env = SCANENV_MEMENV(env); BackRefNode* br = BACKREF_(node); if (NODE_IS_RECURSION(node)) { - len = INFINITE_LEN; +#ifdef USE_BACKREF_WITH_LEVEL + if (NODE_IS_NEST_LEVEL(node)) { + len = INFINITE_LEN; + } +#endif break; } backs = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { - tmax = tree_max_len(mem_env[backs[i]].mem_node, env); + tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env); if (len < tmax) len = tmax; } } @@ -3172,7 +3785,7 @@ tree_max_len(Node* node, ScanEnv* env) #ifdef USE_CALL case NODE_CALL: if (! NODE_IS_RECURSION(node)) - len = tree_max_len(NODE_BODY(node), env); + len = node_max_byte_len(NODE_BODY(node), env); else len = INFINITE_LEN; break; @@ -3183,7 +3796,7 @@ tree_max_len(Node* node, ScanEnv* env) QuantNode* qn = QUANT_(node); if (qn->upper != 0) { - len = tree_max_len(NODE_BODY(node), env); + len = node_max_byte_len(NODE_BODY(node), env); if (len != 0) { if (! IS_INFINITE_REPEAT(qn->upper)) len = distance_multiply(len, qn->upper); @@ -3199,37 +3812,37 @@ tree_max_len(Node* node, ScanEnv* env) BagNode* en = BAG_(node); switch (en->type) { case BAG_MEMORY: - if (NODE_IS_MAX_FIXED(node)) + if (NODE_IS_FIXED_MAX(node)) len = en->max_len; else { if (NODE_IS_MARK1(node)) len = INFINITE_LEN; else { NODE_STATUS_ADD(node, MARK1); - len = tree_max_len(NODE_BODY(node), env); + len = node_max_byte_len(NODE_BODY(node), env); NODE_STATUS_REMOVE(node, MARK1); en->max_len = len; - NODE_STATUS_ADD(node, MAX_FIXED); + NODE_STATUS_ADD(node, FIXED_MAX); } } break; case BAG_OPTION: case BAG_STOP_BACKTRACK: - len = tree_max_len(NODE_BODY(node), env); + len = node_max_byte_len(NODE_BODY(node), env); break; case BAG_IF_ELSE: { OnigLen tlen, elen; - len = tree_max_len(NODE_BODY(node), env); + len = node_max_byte_len(NODE_BODY(node), env); if (IS_NOT_NULL(en->te.Then)) { - tlen = tree_max_len(en->te.Then, env); + tlen = node_max_byte_len(en->te.Then, env); len = distance_add(len, tlen); } if (IS_NOT_NULL(en->te.Else)) - elen = tree_max_len(en->te.Else, env); + elen = node_max_byte_len(en->te.Else, env); else elen = 0; if (elen > len) len = elen; @@ -3537,7 +4150,7 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) if (ret < 0 || (ret & RECURSION_INFINITE) != 0) return ret; r |= ret; if (head != 0) { - min = tree_min_len(NODE_CAR(x), env); + min = node_min_byte_len(NODE_CAR(x), env); if (min != 0) head = 0; } } while (IS_NOT_NULL(x = NODE_CDR(x))); @@ -3602,7 +4215,7 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) if (IS_NOT_NULL(en->te.Then)) { OnigLen min; if (head != 0) { - min = tree_min_len(NODE_BODY(node), env); + min = node_min_byte_len(NODE_BODY(node), env); } else min = 0; @@ -3888,7 +4501,9 @@ reduce_string_list(Node* node) next_node = NODE_CDR(node); curr = NODE_CAR(node); if (NODE_TYPE(curr) == NODE_STRING) { - if (IS_NULL(prev) || STR_(curr)->flag != STR_(prev)->flag) { + if (IS_NULL(prev) + || STR_(curr)->flag != STR_(prev)->flag + || NODE_STATUS(curr) != NODE_STATUS(prev)) { prev = curr; prev_node = node; } @@ -3967,9 +4582,13 @@ reduce_string_list(Node* node) static int divide_look_behind_alternatives(Node* node) { + int r; + int anc_type; Node *head, *np, *insert_node; - AnchorNode* an = ANCHOR_(node); - int anc_type = an->type; + AnchorNode* an; + + an = ANCHOR_(node); + anc_type = an->type; head = NODE_ANCHOR_BODY(an); np = NODE_CAR(head); @@ -3979,7 +4598,8 @@ divide_look_behind_alternatives(Node* node) np = node; while (IS_NOT_NULL(np = NODE_CDR(np))) { - insert_node = onig_node_new_anchor(anc_type, an->ascii_mode); + r = onig_node_copy(&insert_node, head); + if (r != 0) return r; CHECK_NULL_RETURN_MEMERR(insert_node); NODE_BODY(insert_node) = NODE_CAR(np); NODE_CAR(np) = insert_node; @@ -3995,21 +4615,162 @@ divide_look_behind_alternatives(Node* node) } static int -tune_look_behind(Node* node, regex_t* reg, ScanEnv* env) +node_reduce_in_look_behind(Node* node) { - int r, len; + NodeType type; + Node* body; + + if (NODE_TYPE(node) != NODE_QUANT) return 0; + + body = NODE_BODY(node); + type = NODE_TYPE(body); + if (type == NODE_STRING || type == NODE_CTYPE || + type == NODE_CCLASS || type == NODE_BACKREF) { + QuantNode* qn = QUANT_(node); + qn->upper = qn->lower; + if (qn->upper == 0) + return 1; /* removed */ + } + + return 0; +} + +static int +list_reduce_in_look_behind(Node* node) +{ + int r; + + switch (NODE_TYPE(node)) { + case NODE_QUANT: + r = node_reduce_in_look_behind(node); + if (r > 0) r = 0; + break; + + case NODE_LIST: + do { + r = node_reduce_in_look_behind(NODE_CAR(node)); + if (r <= 0) break; + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + default: + r = 0; + break; + } + + return r; +} + +static int +alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env) +{ + int r; + + switch (NODE_TYPE(node)) { + case NODE_ALT: + do { + r = list_reduce_in_look_behind(NODE_CAR(node)); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + default: + r = list_reduce_in_look_behind(node); + break; + } + + return r; +} + +static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); + +static int +tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) +{ + int r; + int state1; + int used; + MinMaxCharLen ci; + Node* body; AnchorNode* an = ANCHOR_(node); - r = get_char_len_node(NODE_ANCHOR_BODY(an), reg, &len); - if (r == 0) - an->char_len = len; - else if (r == GET_CHAR_LEN_VARLEN) - r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) { - if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) - r = divide_look_behind_alternatives(node); - else - r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + used = FALSE; + r = check_node_in_look_behind(NODE_ANCHOR_BODY(an), + an->type == ANCR_LOOK_BEHIND_NOT ? 1 : 0, + &used); + if (r < 0) return r; + if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + + if (an->type == ANCR_LOOK_BEHIND_NOT) + state1 = state | IN_NOT | IN_LOOK_BEHIND; + else + state1 = state | IN_LOOK_BEHIND; + + body = NODE_ANCHOR_BODY(an); + /* Execute tune_tree(body) before call node_char_len(). + Because case-fold expansion must be done before node_char_len(). + */ + r = tune_tree(body, reg, state1, env); + if (r != 0) return r; + + r = alt_reduce_in_look_behind(body, reg, env); + if (r != 0) return r; + + r = node_char_len(body, reg, &ci, env); + if (r >= 0) { + /* #177: overflow in onigenc_step_back() */ + if ((ci.max != INFINITE_LEN && ci.max > LOOK_BEHIND_MAX_CHAR_LEN) + || ci.min > LOOK_BEHIND_MAX_CHAR_LEN) { + return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + } + + if (ci.min == 0 && ci.min_is_sure != 0 && used == FALSE) { + if (an->type == ANCR_LOOK_BEHIND_NOT) + r = onig_node_reset_fail(node); + else + r = onig_node_reset_empty(node); + + return r; + } + + if (r == CHAR_LEN_TOP_ALT_FIXED) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) { + r = divide_look_behind_alternatives(node); + if (r == 0) + r = tune_tree(node, reg, state, env); + } + else if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND)) + goto normal; + else + r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + } + else { /* CHAR_LEN_NORMAL */ + normal: + if (ci.min == INFINITE_LEN) { + r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + } + else { + if (ci.min != ci.max && + ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND)) { + r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + } + else { + Node* tail; + + /* check lead_node is already set by double call after + divide_look_behind_alternatives() */ + if (IS_NULL(an->lead_node)) { + an->char_min_len = ci.min; + an->char_max_len = ci.max; + r = get_tree_tail_literal(body, &tail, reg); + if (r == GET_VALUE_FOUND) { + r = onig_node_copy(&(an->lead_node), tail); + if (r != 0) return r; + } + } + r = ONIG_NORMAL; + } + } + } } return r; @@ -4026,7 +4787,7 @@ tune_next(Node* node, Node* next_node, regex_t* reg) QuantNode* qn = QUANT_(node); if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) { #ifdef USE_QUANT_PEEK_NEXT - Node* n = get_head_value_node(next_node, 1, reg); + Node* n = get_tree_head_literal(next_node, 1, reg); /* '\0': for UTF-16BE etc... */ if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') { qn->next_head_exact = n; @@ -4036,9 +4797,9 @@ tune_next(Node* node, Node* next_node, regex_t* reg) if (qn->lower <= 1) { if (is_strict_real_node(NODE_BODY(node))) { Node *x, *y; - x = get_head_value_node(NODE_BODY(node), 0, reg); + x = get_tree_head_literal(NODE_BODY(node), 0, reg); if (IS_NOT_NULL(x)) { - y = get_head_value_node(next_node, 0, reg); + y = get_tree_head_literal(next_node, 0, reg); if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) { Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK); CHECK_NULL_RETURN_MEMERR(en); @@ -4076,11 +4837,13 @@ is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[]) } static int -get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], int* rmin, int* rmax) +get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], + OnigLen* rmin, OnigLen* rmax) { - int i, len, minlen, maxlen; + int i; + OnigLen len, minlen, maxlen; - minlen = INT_MAX; + minlen = INFINITE_LEN; maxlen = 0; for (i = 0; i < n; i++) { OnigCaseFoldCodeItem* item = items + i; @@ -4096,45 +4859,6 @@ get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], int* r } static int -conv_string_case_fold(OnigEncoding enc, OnigCaseFoldType case_fold_flag, - UChar* s, UChar* end, UChar** rs, UChar** rend, int* rcase_min_len) -{ - UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - UChar *sbuf, *ebuf, *sp; - int i, n, len, sbuf_size; - - *rs = NULL; - sbuf_size = (int )(end - s) * 2; - sbuf = (UChar* )xmalloc(sbuf_size); - CHECK_NULL_RETURN_MEMERR(sbuf); - ebuf = sbuf + sbuf_size; - - n = 0; - sp = sbuf; - p = s; - while (p < end) { - len = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, buf); - for (i = 0; i < len; i++) { - if (sp >= ebuf) { - sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2); - CHECK_NULL_RETURN_MEMERR(sbuf); - sp = sbuf + sbuf_size; - sbuf_size *= 2; - ebuf = sbuf + sbuf_size; - } - - *sp++ = buf[i]; - } - n++; - } - - *rs = sbuf; - *rend = sp; - *rcase_min_len = n; - return 0; -} - -static int make_code_list_to_string(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]) { @@ -4186,7 +4910,7 @@ unravel_cf_node_add(Node** rlist, Node* add) static int unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, - unsigned int flag, int case_min_len) + unsigned int flag) { int r; Node *sn, *list; @@ -4195,17 +4919,13 @@ unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, sn = *rsn; if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) { - if (NODE_STRING_IS_CASE_FOLD_MATCH(sn)) - r = node_str_cat_case_fold(sn, s, end, case_min_len); - else - r = onig_node_str_cat(sn, s, end); + r = onig_node_str_cat(sn, s, end); } else { sn = onig_node_new_str(s, end); CHECK_NULL_RETURN_MEMERR(sn); STR_(sn)->flag = flag; - STR_(sn)->case_min_len = case_min_len; r = unravel_cf_node_add(&list, sn); } @@ -4217,27 +4937,8 @@ unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, } static int -unravel_cf_string_fold_add(Node** rlist, Node** rsn, OnigEncoding enc, - OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) -{ - int r; - int case_min_len; - UChar *rs, *rend; - - r = conv_string_case_fold(enc, case_fold_flag, s, end, - &rs, &rend, &case_min_len); - if (r != 0) return r; - - r = unravel_cf_string_add(rlist, rsn, rs, rend, - NODE_STRING_CASE_FOLD_MATCH, case_min_len); - xfree(rs); - - return r; -} - -static int unravel_cf_string_alt_or_cc_add(Node** rlist, int n, - OnigCaseFoldCodeItem items[], int byte_len, OnigEncoding enc, + OnigCaseFoldCodeItem items[], OnigEncoding enc, OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) { int r, i; @@ -4294,7 +4995,7 @@ unravel_cf_string_alt_or_cc_add(Node** rlist, int n, static int unravel_cf_look_behind_add(Node** rlist, Node** rsn, int n, OnigCaseFoldCodeItem items[], OnigEncoding enc, - UChar* s, int one_len) + UChar* s, OnigLen one_len) { int r, i, found; @@ -4309,7 +5010,7 @@ unravel_cf_look_behind_add(Node** rlist, Node** rsn, } if (found == 0) { - r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */, 0); + r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */); } else { Node* node; @@ -4340,7 +5041,8 @@ unravel_cf_look_behind_add(Node** rlist, Node** rsn, static int unravel_case_fold_string(Node* node, regex_t* reg, int state) { - int r, n, one_len, min_len, max_len, in_look_behind; + int r, n, in_look_behind; + OnigLen min_len, max_len, one_len; UChar *start, *end, *p, *q; StrNode* snode; Node *sn, *list; @@ -4349,8 +5051,8 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state) if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0; + NODE_STATUS_REMOVE(node, IGNORECASE); snode = STR_(node); - start = snode->s; end = snode->end; if (start >= end) return 0; @@ -4368,32 +5070,36 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state) goto err; } - one_len = enclen(enc, p); + one_len = (OnigLen )enclen(enc, p); if (n == 0) { q = p + one_len; - r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */, 0); + r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */); if (r != 0) goto err; } else { if (in_look_behind != 0) { q = p + one_len; + if (items[0].byte_len != one_len) { + r = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, q, + items); + if (r < 0) goto err; + n = r; + } r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len); if (r != 0) goto err; } else { get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len); - q = p + max_len; - if (one_len == max_len && min_len == max_len) { - r = unravel_cf_string_alt_or_cc_add(&list, n, items, max_len, enc, - reg->case_fold_flag, p, q); - if (r != 0) goto err; - sn = NULL_NODE; - } - else { - r = unravel_cf_string_fold_add(&list, &sn, enc, reg->case_fold_flag, - p, q); - if (r != 0) goto err; + if (min_len != max_len) { + r = ONIGERR_PARSER_BUG; + goto err; } + + q = p + max_len; + r = unravel_cf_string_alt_or_cc_add(&list, n, items, enc, + reg->case_fold_flag, p, q); + if (r != 0) goto err; + sn = NULL_NODE; } } @@ -4428,7 +5134,7 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state) static enum BodyEmptyType quantifiers_memory_node_info(Node* node) { - int r = BODY_IS_EMPTY_POSSIBILITY; + int r = BODY_MAY_BE_EMPTY; switch (NODE_TYPE(node)) { case NODE_LIST: @@ -4445,7 +5151,7 @@ quantifiers_memory_node_info(Node* node) #ifdef USE_CALL case NODE_CALL: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */ + return BODY_MAY_BE_EMPTY_REC; /* tiny version */ } else r = quantifiers_memory_node_info(NODE_BODY(node)); @@ -4467,9 +5173,9 @@ quantifiers_memory_node_info(Node* node) switch (en->type) { case BAG_MEMORY: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_POSSIBILITY_REC; + return BODY_MAY_BE_EMPTY_REC; } - return BODY_IS_EMPTY_POSSIBILITY_MEM; + return BODY_MAY_BE_EMPTY_MEM; break; case BAG_OPTION: @@ -4524,7 +5230,7 @@ tune_call_node_call(CallNode* cn, ScanEnv* env, int state) if (env->num_named > 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && - ! ONIG_IS_OPTION_ON(env->options, ONIG_OPTION_CAPTURE_GROUP)) { + ! OPTON_CAPTURE_GROUP(env->options)) { return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; } @@ -4935,35 +5641,12 @@ tune_called_state(Node* node, int state) #endif /* USE_CALL */ -static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); - #ifdef __GNUC__ __inline #endif static int tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) { -/* allowed node types in look-behind */ -#define ALLOWED_TYPE_IN_LB \ - ( NODE_BIT_LIST | NODE_BIT_ALT | NODE_BIT_STRING | NODE_BIT_CCLASS \ - | NODE_BIT_CTYPE | NODE_BIT_ANCHOR | NODE_BIT_BAG | NODE_BIT_QUANT \ - | NODE_BIT_CALL | NODE_BIT_GIMMICK) - -#define ALLOWED_BAG_IN_LB ( 1<<BAG_MEMORY | 1<<BAG_OPTION | 1<<BAG_IF_ELSE ) -#define ALLOWED_BAG_IN_LB_NOT ( 1<<BAG_OPTION | 1<<BAG_IF_ELSE ) - -#define ALLOWED_ANCHOR_IN_LB \ - ( ANCR_LOOK_BEHIND | ANCR_BEGIN_LINE | ANCR_END_LINE | ANCR_BEGIN_BUF \ - | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY | ANCR_NO_WORD_BOUNDARY \ - | ANCR_WORD_BEGIN | ANCR_WORD_END \ - | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY ) - -#define ALLOWED_ANCHOR_IN_LB_NOT \ - ( ANCR_LOOK_BEHIND | ANCR_LOOK_BEHIND_NOT | ANCR_BEGIN_LINE \ - | ANCR_END_LINE | ANCR_BEGIN_BUF | ANCR_BEGIN_POSITION | ANCR_WORD_BOUNDARY \ - | ANCR_NO_WORD_BOUNDARY | ANCR_WORD_BEGIN | ANCR_WORD_END \ - | ANCR_TEXT_SEGMENT_BOUNDARY | ANCR_NO_TEXT_SEGMENT_BOUNDARY ) - int r; AnchorNode* an = ANCHOR_(node); @@ -4976,28 +5659,8 @@ tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) break; case ANCR_LOOK_BEHIND: - { - r = check_type_tree(NODE_ANCHOR_BODY(an), ALLOWED_TYPE_IN_LB, - ALLOWED_BAG_IN_LB, ALLOWED_ANCHOR_IN_LB); - if (r < 0) return r; - if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env); - if (r != 0) return r; - r = tune_look_behind(node, reg, env); - } - break; - case ANCR_LOOK_BEHIND_NOT: - { - r = check_type_tree(NODE_ANCHOR_BODY(an), ALLOWED_TYPE_IN_LB, - ALLOWED_BAG_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); - if (r < 0) return r; - if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND), - env); - if (r != 0) return r; - r = tune_look_behind(node, reg, env); - } + r = tune_look_behind(node, reg, state, env); break; default: @@ -5015,7 +5678,6 @@ static int tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) { int r; - OnigLen d; QuantNode* qn = QUANT_(node); Node* body = NODE_BODY(node); @@ -5027,12 +5689,12 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) { - d = tree_min_len(body, env); + OnigLen d = node_min_byte_len(body, env); if (d == 0) { #ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT qn->emptiness = quantifiers_memory_node_info(body); #else - qn->emptiness = BODY_IS_EMPTY_POSSIBILITY; + qn->emptiness = BODY_MAY_BE_EMPTY; #endif } } @@ -5054,7 +5716,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { int i, n = qn->lower; - node_conv_to_str_node(node, STR_(body)->flag); + node_conv_to_str_node(node, body); for (i = 0; i < n; i++) { r = node_str_node_cat(node, body); if (r != 0) return r; @@ -5074,7 +5736,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } } else { - qn->head_exact = get_head_value_node(NODE_BODY(node), 1, reg); + qn->head_exact = get_tree_head_literal(NODE_BODY(node), 1, reg); } } @@ -5115,7 +5777,7 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case NODE_STRING: - if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_CRUDE(node)) { + if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { r = unravel_case_fold_string(node, reg, state); } break; @@ -5299,14 +5961,8 @@ set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand, #endif typedef struct { - OnigLen min; /* min byte length */ - OnigLen max; /* max byte length */ -} MinMax; - -typedef struct { - MinMax mm; + MinMaxLen mm; OnigEncoding enc; - OnigOptionType options; OnigCaseFoldType case_fold_flag; ScanEnv* scan_env; } OptEnv; @@ -5317,23 +5973,22 @@ typedef struct { } OptAnc; typedef struct { - MinMax mm; /* position */ + MinMaxLen mm; /* position */ OptAnc anc; int reach_end; - int case_fold; int len; UChar s[OPT_EXACT_MAXLEN]; } OptStr; typedef struct { - MinMax mm; /* position */ + MinMaxLen mm; /* position */ OptAnc anc; int value; /* weighted value */ UChar map[CHAR_MAP_SIZE]; } OptMap; typedef struct { - MinMax len; + MinMaxLen len; OptAnc anc; OptStr sb; /* boundary */ OptStr sm; /* middle */ @@ -5367,7 +6022,7 @@ map_position_value(OnigEncoding enc, int i) } static int -distance_value(MinMax* mm) +distance_value(MinMaxLen* mm) { /* 1000 / (min-max-dist + 1) */ static const short int dist_vals[] = { @@ -5396,7 +6051,7 @@ distance_value(MinMax* mm) } static int -comp_distance_value(MinMax* d1, MinMax* d2, int v1, int v2) +comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2) { if (v2 <= 0) return -1; if (v1 <= 0) return 1; @@ -5412,46 +6067,6 @@ comp_distance_value(MinMax* d1, MinMax* d2, int v1, int v2) return 0; } -static int -is_equal_mml(MinMax* a, MinMax* b) -{ - return a->min == b->min && a->max == b->max; -} - -static void -set_mml(MinMax* l, OnigLen min, OnigLen max) -{ - l->min = min; - l->max = max; -} - -static void -clear_mml(MinMax* l) -{ - l->min = l->max = 0; -} - -static void -copy_mml(MinMax* to, MinMax* from) -{ - to->min = from->min; - to->max = from->max; -} - -static void -add_mml(MinMax* to, MinMax* from) -{ - to->min = distance_add(to->min, from->min); - to->max = distance_add(to->max, from->max); -} - -static void -alt_merge_mml(MinMax* to, MinMax* from) -{ - if (to->min > from->min) to->min = from->min; - if (to->max < from->max) to->max = from->max; -} - static void copy_opt_env(OptEnv* to, OptEnv* from) { @@ -5543,12 +6158,11 @@ is_full_opt_exact(OptStr* e) static void clear_opt_exact(OptStr* e) { - clear_mml(&e->mm); + mml_clear(&e->mm); clear_opt_anc_info(&e->anc); - e->reach_end = 0; - e->case_fold = 0; - e->len = 0; - e->s[0] = '\0'; + e->reach_end = 0; + e->len = 0; + e->s[0] = '\0'; } static void @@ -5564,14 +6178,6 @@ concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc) UChar *p, *end; OptAnc tanc; - if (add->case_fold != 0) { - if (! to->case_fold) { - if (to->len > 1 || to->len >= add->len) return 0; /* avoid */ - - to->case_fold = 1; - } - } - r = 0; p = add->s; end = p + add->len; @@ -5610,7 +6216,7 @@ concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc) to->len = i; - if (p >= end && to->len == (int )(end - s)) + if (p >= end) to->reach_end = 1; } @@ -5624,7 +6230,7 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) return ; } - if (! is_equal_mml(&to->mm, &add->mm)) { + if (! mml_is_equal(&to->mm, &add->mm)) { clear_opt_exact(to); return ; } @@ -5644,8 +6250,6 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) to->reach_end = 0; } to->len = i; - if (add->case_fold != 0) - to->case_fold = 1; alt_merge_opt_anc_info(&to->anc, &add->anc); if (! to->reach_end) to->anc.right = 0; @@ -5675,8 +6279,8 @@ select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt) if (alt->len > 1) va += 5; } - if (now->case_fold == 0) vn *= 2; - if (alt->case_fold == 0) va *= 2; + vn *= 2; + va *= 2; if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0) copy_opt_exact(now, alt); @@ -5725,28 +6329,6 @@ add_char_opt_map(OptMap* m, UChar c, OnigEncoding enc) } } -static int -add_char_amb_opt_map(OptMap* map, UChar* p, UChar* end, - OnigEncoding enc, OnigCaseFoldType fold_flag) -{ - OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - int i, n; - - add_char_opt_map(map, p[0], enc); - - fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(fold_flag); - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, fold_flag, p, end, items); - if (n < 0) return n; - - for (i = 0; i < n; i++) { - ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf); - add_char_opt_map(map, buf[0], enc); - } - - return 0; -} - static void select_opt_map(OptMap* now, OptMap* alt) { @@ -5775,12 +6357,7 @@ comp_opt_exact_or_map(OptStr* e, OptMap* m) if (m->value <= 0) return -1; - if (e->case_fold != 0) { - case_value = 1; - } - else - case_value = 3; - + case_value = 3; ae = COMP_EM_BASE * e->len * case_value; am = COMP_EM_BASE * 5 * 2 / m->value; return comp_distance_value(&e->mm, &m->mm, ae, am); @@ -5791,14 +6368,14 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) { int i, val; - /* if (! is_equal_mml(&to->mm, &add->mm)) return ; */ + /* if (! mml_is_equal(&to->mm, &add->mm)) return ; */ if (to->value == 0) return ; if (add->value == 0 || to->mm.max < add->mm.min) { clear_opt_map(to); return ; } - alt_merge_mml(&to->mm, &add->mm); + mml_alt_merge(&to->mm, &add->mm); val = 0; for (i = 0; i < CHAR_MAP_SIZE; i++) { @@ -5814,17 +6391,17 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) } static void -set_bound_node_opt_info(OptNode* opt, MinMax* plen) +set_bound_node_opt_info(OptNode* opt, MinMaxLen* plen) { - copy_mml(&(opt->sb.mm), plen); - copy_mml(&(opt->spr.mm), plen); - copy_mml(&(opt->map.mm), plen); + mml_copy(&(opt->sb.mm), plen); + mml_copy(&(opt->spr.mm), plen); + mml_copy(&(opt->map.mm), plen); } static void clear_node_opt_info(OptNode* opt) { - clear_mml(&opt->len); + mml_clear(&opt->len); clear_opt_anc_info(&opt->anc); clear_opt_exact(&opt->sb); clear_opt_exact(&opt->sm); @@ -5889,7 +6466,7 @@ concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add) } select_opt_map(&to->map, &add->map); - add_mml(&to->len, &add->len); + mml_add(&to->len, &add->len); } static void @@ -5901,7 +6478,7 @@ alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env) alt_merge_opt_exact(&to->spr, &add->spr, env); alt_merge_opt_map(env->enc, &to->map, &add->map); - alt_merge_mml(&to->len, &add->len); + mml_alt_merge(&to->len, &add->len); } @@ -5930,7 +6507,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) do { r = optimize_nodes(NODE_CAR(nd), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mm, &xo.len); + mml_add(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); } } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd))); @@ -5956,29 +6533,11 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) StrNode* sn = STR_(node); int slen = (int )(sn->end - sn->s); - if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) { - concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); - if (slen > 0) { - add_char_opt_map(&opt->map, *(sn->s), enc); - } - set_mml(&opt->len, slen, slen); - } - else { - int max, min; - - concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); - opt->sb.case_fold = 1; - - if (slen > 0) { - r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, - enc, env->case_fold_flag); - if (r != 0) break; - } - - max = slen; - min = sn->case_min_len * ONIGENC_MBC_MINLEN(enc); - set_mml(&opt->len, min, max); + concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); + if (slen > 0) { + add_char_opt_map(&opt->map, *(sn->s), enc); } + mml_set_min_max(&opt->len, slen, slen); } break; @@ -5993,7 +6552,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) OnigLen min = ONIGENC_MBC_MINLEN(enc); OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc); - set_mml(&opt->len, min, max); + mml_set_min_max(&opt->len, min, max); } else { for (i = 0; i < SINGLE_BYTE_SIZE; i++) { @@ -6002,7 +6561,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) add_char_opt_map(&opt->map, (UChar )i, enc); } } - set_mml(&opt->len, 1, 1); + mml_set_min_max(&opt->len, 1, 1); } } break; @@ -6046,7 +6605,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) else { min = ONIGENC_MBC_MINLEN(enc); } - set_mml(&opt->len, min, max); + mml_set_min_max(&opt->len, min, max); } break; @@ -6087,37 +6646,20 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) case NODE_BACKREF: if (! NODE_IS_CHECKER(node)) { - int* backs; - OnigLen min, max, tmin, tmax; - MemEnv* mem_env = SCANENV_MEMENV(env->scan_env); - BackRefNode* br = BACKREF_(node); + OnigLen min, max; - if (NODE_IS_RECURSION(node)) { - set_mml(&opt->len, 0, INFINITE_LEN); - break; - } - backs = BACKREFS_P(br); - min = tree_min_len(mem_env[backs[0]].mem_node, env->scan_env); - max = tree_max_len(mem_env[backs[0]].mem_node, env->scan_env); - for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].mem_node, env->scan_env); - tmax = tree_max_len(mem_env[backs[i]].mem_node, env->scan_env); - if (min > tmin) min = tmin; - if (max < tmax) max = tmax; - } - set_mml(&opt->len, min, max); + min = node_min_byte_len(node, env->scan_env); + max = node_max_byte_len(node, env->scan_env); + mml_set_min_max(&opt->len, min, max); } break; #ifdef USE_CALL case NODE_CALL: if (NODE_IS_RECURSION(node)) - set_mml(&opt->len, 0, INFINITE_LEN); + mml_set_min_max(&opt->len, 0, INFINITE_LEN); else { - OnigOptionType save = env->options; - env->options = BAG_(NODE_BODY(node))->o.options; r = optimize_nodes(NODE_BODY(node), opt, env); - env->options = save; } break; #endif @@ -6127,6 +6669,20 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) OnigLen min, max; QuantNode* qn = QUANT_(node); + /* Issue #175 + ex. /\g<1>{0}(?<=|())/ + + Empty and unused nodes in look-behind is removed in + tune_look_behind(). + Called group nodes are assigned to be not called if the caller side is + inside of zero-repetition. + As a result, the nodes are considered unused. + */ + if (qn->upper == 0) { + mml_set_min_max(&opt->len, 0, 0); + break; + } + r = optimize_nodes(NODE_BODY(node), &xo, env); if (r != 0) break; @@ -6153,7 +6709,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) if (IS_INFINITE_REPEAT(qn->upper)) { if (env->mm.max == 0 && NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) { - if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) + if (NODE_IS_MULTILINE(NODE_QUANT_BODY(qn))) add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML); else add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF); @@ -6166,7 +6722,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) } min = distance_multiply(xo.len.min, qn->lower); - set_mml(&opt->len, min, max); + mml_set_min_max(&opt->len, min, max); } break; @@ -6175,14 +6731,9 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) BagNode* en = BAG_(node); switch (en->type) { + case BAG_STOP_BACKTRACK: case BAG_OPTION: - { - OnigOptionType save = env->options; - - env->options = en->o.options; - r = optimize_nodes(NODE_BODY(node), opt, env); - env->options = save; - } + r = optimize_nodes(NODE_BODY(node), opt, env); break; case BAG_MEMORY: @@ -6193,9 +6744,9 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) min = 0; max = INFINITE_LEN; - if (NODE_IS_MIN_FIXED(node)) min = en->min_len; - if (NODE_IS_MAX_FIXED(node)) max = en->max_len; - set_mml(&opt->len, min, max); + if (NODE_IS_FIXED_MIN(node)) min = en->min_len; + if (NODE_IS_FIXED_MAX(node)) max = en->max_len; + mml_set_min_max(&opt->len, min, max); } else #endif @@ -6208,10 +6759,6 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) } break; - case BAG_STOP_BACKTRACK: - r = optimize_nodes(NODE_BODY(node), opt, env); - break; - case BAG_IF_ELSE: { OptEnv nenv; @@ -6219,7 +6766,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) copy_opt_env(&nenv, env); r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mm, &xo.len); + mml_add(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); if (IS_NOT_NULL(en->te.Then)) { r = optimize_nodes(en->te.Then, &xo, &nenv); @@ -6245,7 +6792,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) default: #ifdef ONIG_DEBUG - fprintf(stderr, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node)); + fprintf(DBGFP, "optimize_nodes: undefined node type %d\n", NODE_TYPE(node)); #endif r = ONIGERR_TYPE_BUG; break; @@ -6258,6 +6805,7 @@ static int set_optimize_exact(regex_t* reg, OptStr* e) { int r; + int allow_reverse; if (e->len == 0) return 0; @@ -6266,40 +6814,28 @@ set_optimize_exact(regex_t* reg, OptStr* e) xmemcpy(reg->exact, e->s, e->len); reg->exact_end = reg->exact + e->len; - if (e->case_fold) { - reg->optimize = OPTIMIZE_STR_CASE_FOLD; - } - else { - int allow_reverse; + allow_reverse = + ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); - allow_reverse = - ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); - - if (e->len >= 2 || (e->len >= 1 && allow_reverse)) { - r = set_sunday_quick_search_or_bmh_skip_table(reg, 0, - reg->exact, reg->exact_end, - reg->map, &(reg->map_offset)); - if (r != 0) return r; + if (e->len >= 2 || (e->len >= 1 && allow_reverse)) { + r = set_sunday_quick_search_or_bmh_skip_table(reg, 0, + reg->exact, reg->exact_end, + reg->map, &(reg->map_offset)); + if (r != 0) return r; - reg->optimize = (allow_reverse != 0 - ? OPTIMIZE_STR_FAST - : OPTIMIZE_STR_FAST_STEP_FORWARD); - } - else { - reg->optimize = OPTIMIZE_STR; - } + reg->optimize = (allow_reverse != 0 + ? OPTIMIZE_STR_FAST + : OPTIMIZE_STR_FAST_STEP_FORWARD); + } + else { + reg->optimize = OPTIMIZE_STR; } reg->dist_min = e->mm.min; reg->dist_max = e->mm.max; if (reg->dist_min != INFINITE_LEN) { - int n; - if (e->case_fold != 0) - n = 1; - else - n = (int )(reg->exact_end - reg->exact); - + int n = (int )(reg->exact_end - reg->exact); reg->threshold_len = reg->dist_min + n; } @@ -6319,7 +6855,7 @@ set_optimize_map(regex_t* reg, OptMap* m) reg->dist_max = m->mm.max; if (reg->dist_min != INFINITE_LEN) { - reg->threshold_len = reg->dist_min + 1; + reg->threshold_len = reg->dist_min + ONIGENC_MBC_MINLEN(reg->enc); } } @@ -6342,10 +6878,9 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) OptEnv env; env.enc = reg->enc; - env.options = reg->options; env.case_fold_flag = reg->case_fold_flag; env.scan_env = scan_env; - clear_mml(&env.mm); + mml_clear(&env.mm); r = optimize_nodes(node, &opt, &env); if (r != 0) return r; @@ -6387,7 +6922,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) } #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) - print_optimize_info(stderr, reg); + print_optimize_info(DBGFP, reg); #endif return r; } @@ -6414,8 +6949,6 @@ clear_optimize_info(regex_t* reg) static void print_enc_string(FILE* fp, OnigEncoding enc, const UChar *s, const UChar *end) { - fprintf(fp, "\nPATTERN: /"); - if (ONIGENC_MBC_MINLEN(enc) > 1) { const UChar *p; OnigCodePoint code; @@ -6515,9 +7048,8 @@ print_anchor(FILE* f, int anchor) static void print_optimize_info(FILE* f, regex_t* reg) { - static const char* on[] = { "NONE", "STR", - "STR_FAST", "STR_FAST_STEP_FORWARD", - "STR_CASE_FOLD", "MAP" }; + static const char* on[] = + { "NONE", "STR", "STR_FAST", "STR_FAST_STEP_FORWARD", "MAP" }; fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " anchor: "); print_anchor(f, reg->anchor); @@ -6537,7 +7069,12 @@ print_optimize_info(FILE* f, regex_t* reg) for (p = reg->exact; p < reg->exact_end; p++) { fputc(*p, f); } - fprintf(f, "]: length: %ld\n", (reg->exact_end - reg->exact)); + fprintf(f, "]: length: %ld, dmin: %u, ", + (reg->exact_end - reg->exact), reg->dist_min); + if (reg->dist_max == INFINITE_LEN) + fprintf(f, "dmax: inf.\n"); + else + fprintf(f, "dmax: %u\n", reg->dist_max); } else if (reg->optimize & OPTIMIZE_MAP) { int c, i, n = 0; @@ -6545,7 +7082,8 @@ print_optimize_info(FILE* f, regex_t* reg) for (i = 0; i < CHAR_MAP_SIZE; i++) if (reg->map[i]) n++; - fprintf(f, "map: n=%d\n", n); + fprintf(f, "map: n=%d, dmin: %u, dmax: %u\n", + n, reg->dist_min, reg->dist_max); if (n > 0) { c = 0; fputc('[', f); @@ -6680,7 +7218,8 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #ifdef ONIG_DEBUG - print_enc_string(stderr, reg->enc, pattern, pattern_end); + fprintf(DBGFP, "\nPATTERN: /"); + print_enc_string(DBGFP, reg->enc, pattern, pattern_end); #endif if (reg->ops_alloc == 0) { @@ -6708,7 +7247,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, /* mixed use named group and no-named group */ if (scan_env.num_named > 0 && IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && - ! ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) { + ! OPTON_CAPTURE_GROUP(reg->options)) { if (scan_env.num_named != scan_env.num_mem) r = disable_noname_group_capture(&root, reg, &scan_env); else @@ -6741,10 +7280,10 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, #endif #ifdef ONIG_DEBUG_PARSE - fprintf(stderr, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); - fprintf(stderr, "TREE (parsed)\n"); - print_tree(stderr, root); - fprintf(stderr, "\n"); + fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); + fprintf(DBGFP, "TREE (parsed)\n"); + print_tree(DBGFP, root); + fprintf(DBGFP, "\n"); #endif r = tune_tree(root, reg, 0, &scan_env); @@ -6758,13 +7297,13 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #ifdef ONIG_DEBUG_PARSE - fprintf(stderr, "TREE (after tune)\n"); - print_tree(stderr, root); - fprintf(stderr, "\n"); + fprintf(DBGFP, "TREE (after tune)\n"); + print_tree(DBGFP, root); + fprintf(DBGFP, "\n"); #endif - reg->capture_history = scan_env.cap_history; - reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history; + reg->capture_history = scan_env.cap_history; + reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history; #ifdef USE_CALLOUT if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) { @@ -6804,6 +7343,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, COP(reg)->update_var.type = UPDATE_VAR_KEEP_FROM_STACK_LAST; COP(reg)->update_var.id = 0; /* not used */ + COP(reg)->update_var.clear = FALSE; } r = add_op(reg, OP_END); @@ -6827,6 +7367,9 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, #ifdef USE_CALLOUT || (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) #endif +#ifdef USE_CALL + || scan_env.num_call > 0 +#endif ) reg->stack_pop_level = STACK_POP_LEVEL_ALL; else { @@ -6847,8 +7390,8 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, onig_node_free(root); #ifdef ONIG_DEBUG_COMPILE - onig_print_names(stderr, reg); - onig_print_compiled_byte_code_list(stderr, reg); + onig_print_names(DBGFP, reg); + onig_print_compiled_byte_code_list(DBGFP, reg); #endif #ifdef USE_DIRECT_THREADED_CODE @@ -6920,20 +7463,18 @@ onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_fl else option |= syntax->options; - (reg)->enc = enc; - (reg)->options = option; - (reg)->syntax = syntax; - (reg)->optimize = 0; - (reg)->exact = (UChar* )NULL; - (reg)->extp = (RegexExt* )NULL; - - (reg)->ops = (Operation* )NULL; - (reg)->ops_curr = (Operation* )NULL; - (reg)->ops_used = 0; - (reg)->ops_alloc = 0; - (reg)->name_table = (void* )NULL; - - (reg)->case_fold_flag = case_fold_flag; + (reg)->enc = enc; + (reg)->options = option; + (reg)->syntax = syntax; + (reg)->optimize = 0; + (reg)->exact = (UChar* )NULL; + (reg)->extp = (RegexExt* )NULL; + (reg)->ops = (Operation* )NULL; + (reg)->ops_curr = (Operation* )NULL; + (reg)->ops_used = 0; + (reg)->ops_alloc = 0; + (reg)->name_table = (void* )NULL; + (reg)->case_fold_flag = case_fold_flag; return 0; } @@ -7171,8 +7712,8 @@ print_indent_tree(FILE* f, Node* node, int indent) if (NODE_STRING_IS_CRUDE(node)) mode = "-crude"; - else if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) - mode = "-case_fold_match"; + else if (NODE_IS_IGNORECASE(node)) + mode = "-ignorecase"; else mode = ""; @@ -7208,7 +7749,7 @@ print_indent_tree(FILE* f, Node* node, int indent) fprintf(f, "<ctype:%p> ", node); switch (CTYPE_(node)->ctype) { case CTYPE_ANYCHAR: - fprintf(f, "<anychar:%p>", node); + fprintf(f, "anychar"); break; case ONIGENC_CTYPE_WORD: @@ -7295,9 +7836,10 @@ print_indent_tree(FILE* f, Node* node, int indent) #endif case NODE_QUANT: - fprintf(f, "<quantifier:%p>{%d,%d}%s\n", node, + fprintf(f, "<quantifier:%p>{%d,%d}%s%s\n", node, QUANT_(node)->lower, QUANT_(node)->upper, - (QUANT_(node)->greedy ? "" : "?")); + (QUANT_(node)->greedy ? "" : "?"), + QUANT_(node)->include_referred == 0 ? "" : " referred"); print_indent_tree(f, NODE_BODY(node), indent + add); break; @@ -7337,6 +7879,10 @@ print_indent_tree(FILE* f, Node* node, int indent) break; case BAG_MEMORY: fprintf(f, "memory:%d", BAG_(node)->m.regnum); + if (NODE_IS_CALLED(node)) + fprintf(f, ", called"); + if (NODE_IS_FIXED_ADDR(node)) + fprintf(f, ", fixed-addr"); break; case BAG_STOP_BACKTRACK: fprintf(f, "stop-bt"); |