diff options
author | Jörg Frings-Fürst <debian@jff.email> | 2019-11-29 11:26:57 +0100 |
---|---|---|
committer | Jörg Frings-Fürst <debian@jff.email> | 2019-11-29 11:26:57 +0100 |
commit | 7f4e90f2759d6a15812172ee19f3ad5b58940beb (patch) | |
tree | 5f90c63b8ba73f4ecd23d6e642c1ab34dccea033 /src/regparse.c | |
parent | 68d1ec60c90d27c511d51ce0bef44b132a7ddf11 (diff) | |
parent | 7e149a97d276ce3b4c5e34f965766c8e40e03fef (diff) |
Merge branch 'feature/upstream' into develop
Diffstat (limited to 'src/regparse.c')
-rw-r--r-- | src/regparse.c | 968 |
1 files changed, 514 insertions, 454 deletions
diff --git a/src/regparse.c b/src/regparse.c index 7f8b1a9..fed53f7 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -2,7 +2,7 @@ regparse.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -199,6 +199,24 @@ onig_set_parse_depth_limit(unsigned int depth) return 0; } +#ifdef ONIG_DEBUG_PARSE +#define INC_PARSE_DEPTH(d) do {\ + (d)++;\ + if (env->max_parse_depth < (d)) env->max_parse_depth = d;\ + if ((d) > ParseDepthLimit) \ + return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ +} while (0) +#else +#define INC_PARSE_DEPTH(d) do {\ + (d)++;\ + if ((d) > ParseDepthLimit) \ + return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ +} while (0) +#endif + +#define DEC_PARSE_DEPTH(d) (d)-- + + static int bbuf_init(BBuf* buf, int size) { @@ -244,7 +262,8 @@ bbuf_clone(BBuf** rto, BBuf* from) return 0; } -static int backref_rel_to_abs(int rel_no, ScanEnv* env) +static int +backref_rel_to_abs(int rel_no, ScanEnv* env) { if (rel_no > 0) { return env->num_mem + rel_no; @@ -292,15 +311,6 @@ bitset_set_range(BitSetRef bs, int from, int to) } } -#if 0 -static void -bitset_set_all(BitSetRef bs) -{ - int i; - for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); } -} -#endif - static void bitset_invert(BitSetRef bs) { @@ -363,24 +373,6 @@ save_entry(ScanEnv* env, enum SaveType type, int* id) { int nid = env->save_num; -#if 0 - if (IS_NULL(env->saves)) { - int n = 10; - env->saves = (SaveItem* )xmalloc(sizeof(SaveItem) * n); - CHECK_NULL_RETURN_MEMERR(env->saves); - env->save_alloc_num = n; - } - else if (env->save_alloc_num <= nid) { - int n = env->save_alloc_num * 2; - SaveItem* p = (SaveItem* )xrealloc(env->saves, sizeof(SaveItem) * n); - CHECK_NULL_RETURN_MEMERR(p); - env->saves = p; - env->save_alloc_num = n; - } - - env->saves[nid].type = type; -#endif - env->save_num++; *id = nid; return 0; @@ -476,14 +468,14 @@ static int str_end_hash(st_str_end_key* x) { UChar *p; - int val = 0; + unsigned val = 0; p = x->s; while (p < x->end) { - val = val * 997 + (int )*p++; + val = val * 997 + (unsigned )*p++; } - return val + (val >> 5); + return (int) (val + (val >> 5)); } extern hash_table_type* @@ -566,15 +558,15 @@ static int callout_name_table_hash(st_callout_name_key* x) { UChar *p; - int val = 0; + unsigned int val = 0; p = x->s; while (p < x->end) { - val = val * 997 + (int )*p++; + val = val * 997 + (unsigned int )*p++; } /* use intptr_t for escape warning in Windows */ - return val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type; + return (int )(val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type); } extern hash_table_type* @@ -1972,9 +1964,8 @@ callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, static void scan_env_clear(ScanEnv* env) { - MEM_STATUS_CLEAR(env->capture_history); - MEM_STATUS_CLEAR(env->bt_mem_start); - MEM_STATUS_CLEAR(env->bt_mem_end); + MEM_STATUS_CLEAR(env->cap_history); + MEM_STATUS_CLEAR(env->backtrack_mem); MEM_STATUS_CLEAR(env->backrefed_mem); env->error = (UChar* )NULL; env->error_end = (UChar* )NULL; @@ -1993,6 +1984,10 @@ scan_env_clear(ScanEnv* env) xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static)); env->parse_depth = 0; +#ifdef ONIG_DEBUG_PARSE + env->max_parse_depth = 0; +#endif + env->backref_num = 0; env->keep_num = 0; env->save_num = 0; env->save_alloc_num = 0; @@ -2024,11 +2019,8 @@ scan_env_add_mem_entry(ScanEnv* env) } for (i = env->num_mem + 1; i < alloc; i++) { - p[i].node = NULL_NODE; -#if 0 - p[i].in = 0; - p[i].recursion = 0; -#endif + p[i].mem_node = NULL_NODE; + p[i].empty_repeat_node = NULL_NODE; } env->mem_env_dynamic = p; @@ -2044,7 +2036,7 @@ static int scan_env_set_mem_node(ScanEnv* env, int num, Node* node) { if (env->num_mem >= num) - SCANENV_MEMENV(env)[num].node = node; + SCANENV_MEMENV(env)[num].mem_node = node; else return ONIGERR_PARSER_BUG; return 0; @@ -2182,7 +2174,7 @@ node_new_ctype(int type, int not, OnigOptionType options) static Node* node_new_anychar(void) { - Node* node = node_new_ctype(CTYPE_ANYCHAR, 0, ONIG_OPTION_NONE); + Node* node = node_new_ctype(CTYPE_ANYCHAR, FALSE, ONIG_OPTION_NONE); return node; } @@ -2242,24 +2234,6 @@ onig_node_new_list(Node* left, Node* right) } extern Node* -onig_node_list_add(Node* list, Node* x) -{ - Node *n; - - n = onig_node_new_list(x, NULL); - if (IS_NULL(n)) return NULL_NODE; - - if (IS_NOT_NULL(list)) { - while (IS_NOT_NULL(NODE_CDR(list))) - list = NODE_CDR(list); - - NODE_CDR(list) = n; - } - - return n; -} - -extern Node* onig_node_new_alt(Node* left, Node* right) { Node* node = node_new(); @@ -2357,7 +2331,7 @@ node_new_backref(int back_num, int* backrefs, int by_name, for (i = 0; i < back_num; i++) { if (backrefs[i] <= env->num_mem && - IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].node)) { + IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) { NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */ break; } @@ -2377,6 +2351,8 @@ node_new_backref(int back_num, int* backrefs, int by_name, for (i = 0; i < back_num; i++) p[i] = backrefs[i]; } + + env->backref_num++; return node; } @@ -2424,13 +2400,13 @@ node_new_quantifier(int lower, int upper, int by_number) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_QUANT); - QUANT_(node)->lower = lower; - QUANT_(node)->upper = upper; - QUANT_(node)->greedy = 1; - QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; - QUANT_(node)->head_exact = NULL_NODE; - QUANT_(node)->next_head_exact = NULL_NODE; - QUANT_(node)->is_refered = 0; + QUANT_(node)->lower = lower; + QUANT_(node)->upper = upper; + QUANT_(node)->greedy = 1; + QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; + QUANT_(node)->head_exact = NULL_NODE; + QUANT_(node)->next_head_exact = NULL_NODE; + QUANT_(node)->include_referred = 0; if (by_number != 0) NODE_STATUS_ADD(node, BY_NUMBER); @@ -2716,7 +2692,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[1] = NULL_NODE; r = ONIGERR_MEMORY; - ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, 0); + ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, FALSE); if (IS_NULL(ns[0])) goto err; r = node_new_true_anychar(&ns[1], env); @@ -2727,7 +2703,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[0] = x; ns[1] = NULL_NODE; - x = node_new_quantifier(0, INFINITE_REPEAT, 1); + x = node_new_quantifier(0, INFINITE_REPEAT, TRUE); if (IS_NULL(x)) goto err; NODE_BODY(x) = ns[0]; @@ -2796,7 +2772,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, ns[0] = x; - x = node_new_quantifier(lower, upper, 0); + x = node_new_quantifier(lower, upper, FALSE); if (IS_NULL(x)) goto err0; NODE_BODY(x) = ns[0]; @@ -2825,7 +2801,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, x = make_alt(2, ns); if (IS_NULL(x)) goto err0; - if (is_range_cutter != 0) + if (is_range_cutter != FALSE) NODE_STATUS_ADD(x, SUPER); *node = x; @@ -2915,7 +2891,10 @@ make_range_clear(Node** node, ScanEnv* env) ns[0] = NULL_NODE; ns[1] = x; - r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, 0, env); +#define ID_NOT_USED_DONT_CARE_ME 0 + + r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, + ID_NOT_USED_DONT_CARE_ME, env); if (r != 0) goto err; x = make_alt(2, ns); @@ -3034,7 +3013,7 @@ make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* qua id1 = GIMMICK_(ns[0])->id; r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive, - 0, env); + FALSE, env); if (r != 0) goto err; ns[2] = ns[3] = NULL_NODE; @@ -3077,7 +3056,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (expr == NULL_NODE) { /* default expr \O* */ - quant = node_new_quantifier(0, INFINITE_REPEAT, 0); + quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE); if (IS_NULL(quant)) goto err0; r = node_new_true_anychar(&body, env); @@ -3204,16 +3183,6 @@ node_str_cat_char(Node* node, UChar c) } extern void -onig_node_conv_to_str_node(Node* node, int flag) -{ - NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->flag = flag; - STR_(node)->capacity = 0; - STR_(node)->s = STR_(node)->buf; - STR_(node)->end = STR_(node)->buf; -} - -extern void onig_node_str_clear(Node* node) { if (STR_(node)->capacity != 0 && @@ -3221,10 +3190,11 @@ onig_node_str_clear(Node* node) xfree(STR_(node)->s); } - STR_(node)->capacity = 0; STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; } static Node* @@ -3234,10 +3204,12 @@ node_new_str(const UChar* s, const UChar* end) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->capacity = 0; STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; + if (onig_node_str_cat(node, s, end)) { onig_node_free(node); return NULL; @@ -3252,11 +3224,11 @@ onig_node_new_str(const UChar* s, const UChar* end) } static Node* -node_new_str_raw(UChar* s, UChar* end) +node_new_str_crude(UChar* s, UChar* end) { Node* node = node_new_str(s, end); CHECK_NULL_RETURN(node); - NODE_STRING_SET_RAW(node); + NODE_STRING_SET_CRUDE(node); return node; } @@ -3267,14 +3239,14 @@ node_new_empty(void) } static Node* -node_new_str_raw_char(UChar c) +node_new_str_crude_char(UChar c) { int i; UChar p[1]; Node* node; p[0] = c; - node = node_new_str_raw(p, p + 1); + node = node_new_str_crude(p, p + 1); /* clear buf tail */ for (i = 1; i < NODE_STRING_BUF_SIZE; i++) @@ -3297,8 +3269,8 @@ str_node_split_last_char(Node* node, OnigEncoding enc) if (p && p > sn->s) { /* can be split. */ rn = node_new_str(p, sn->end); CHECK_NULL_RETURN(rn); - if (NODE_STRING_IS_RAW(node)) - NODE_STRING_SET_RAW(rn); + if (NODE_STRING_IS_CRUDE(node)) + NODE_STRING_SET_CRUDE(rn); sn->end = (UChar* )p; } @@ -3316,10 +3288,10 @@ str_node_can_be_split(Node* node, OnigEncoding enc) return 0; } -extern int -onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) +static int +scan_number(UChar** src, const UChar* end, OnigEncoding enc) { - unsigned int num, val; + int num, val; OnigCodePoint c; UChar* p = *src; PFETCH_READY; @@ -3328,8 +3300,8 @@ onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) while (! PEND) { PFETCH(c); if (IS_CODE_DIGIT_ASCII(enc, c)) { - val = (unsigned int )DIGITVAL(c); - if ((INT_MAX_LIMIT - val) / 10UL < num) + val = (int )DIGITVAL(c); + if ((INT_MAX - val) / 10 < num) return -1; /* overflow */ num = num * 10 + val; @@ -3344,26 +3316,27 @@ onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) } static int -scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, - int maxlen, OnigEncoding enc) +scan_hexadecimal_number(UChar** src, UChar* end, int minlen, int maxlen, + OnigEncoding enc, OnigCodePoint* rcode) { + OnigCodePoint code; OnigCodePoint c; - unsigned int num, val; + unsigned int val; int n; UChar* p = *src; PFETCH_READY; - num = 0; + code = 0; n = 0; while (! PEND && n < maxlen) { PFETCH(c); if (IS_CODE_XDIGIT_ASCII(enc, c)) { n++; - val = (unsigned int )XDIGITVAL(enc,c); - if ((INT_MAX_LIMIT - val) / 16UL < num) + val = (unsigned int )XDIGITVAL(enc, c); + if ((UINT_MAX - val) / 16UL < code) return ONIGERR_TOO_BIG_NUMBER; /* overflow */ - num = (num << 4) + XDIGITVAL(enc,c); + code = (code << 4) + val; } else { PUNFETCH; @@ -3374,36 +3347,46 @@ scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, if (n < minlen) return ONIGERR_INVALID_CODE_POINT_VALUE; + *rcode = code; *src = p; - return num; + return ONIG_NORMAL; } static int -scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, - OnigEncoding enc) +scan_octal_number(UChar** src, UChar* end, int minlen, int maxlen, + OnigEncoding enc, OnigCodePoint* rcode) { + OnigCodePoint code; OnigCodePoint c; - unsigned int num, val; + unsigned int val; + int n; UChar* p = *src; PFETCH_READY; - num = 0; - while (! PEND && maxlen-- != 0) { + code = 0; + n = 0; + while (! PEND && n < maxlen) { PFETCH(c); if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8') { - val = ODIGITVAL(c); - if ((INT_MAX_LIMIT - val) / 8UL < num) - return -1; /* overflow */ + n++; + val = (unsigned int )ODIGITVAL(c); + if ((UINT_MAX - val) / 8UL < code) + return ONIGERR_TOO_BIG_NUMBER; /* overflow */ - num = (num << 3) + val; + code = (code << 3) + val; } else { PUNFETCH; break; } } + + if (n < minlen) + return ONIGERR_INVALID_CODE_POINT_VALUE; + + *rcode = code; *src = p; - return num; + return ONIG_NORMAL; } @@ -3938,68 +3921,70 @@ static enum ReduceType ReduceTypeTable[6][6] = { {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */ }; -extern void -onig_reduce_nested_quantifier(Node* pnode, Node* cnode) +extern int +onig_reduce_nested_quantifier(Node* pnode) { int pnum, cnum; QuantNode *p, *c; + Node* cnode; + + cnode = NODE_BODY(pnode); p = QUANT_(pnode); c = QUANT_(cnode); pnum = quantifier_type_num(p); cnum = quantifier_type_num(c); if (pnum < 0 || cnum < 0) { - if ((p->lower == p->upper) && ! IS_INFINITE_REPEAT(p->upper)) { - if ((c->lower == c->upper) && ! IS_INFINITE_REPEAT(c->upper)) { - int n = onig_positive_int_multiply(p->lower, c->lower); - if (n >= 0) { - p->lower = p->upper = n; - NODE_BODY(pnode) = NODE_BODY(cnode); - goto remove_cnode; - } - } + if (p->lower == p->upper && c->lower == c->upper) { + int n = onig_positive_int_multiply(p->lower, c->lower); + if (n < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + + p->lower = p->upper = n; + NODE_BODY(pnode) = NODE_BODY(cnode); + goto remove_cnode; } - return ; + return 0; } switch(ReduceTypeTable[cnum][pnum]) { case RQ_DEL: *pnode = *cnode; + goto remove_cnode; break; case RQ_A: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1; + goto remove_cnode; break; case RQ_AQ: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0; + goto remove_cnode; break; case RQ_QQ: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = 1; p->greedy = 0; + goto remove_cnode; break; case RQ_P_QQ: - NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 0; c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1; - return ; break; case RQ_PQ_Q: - NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 1; c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0; - return ; break; case RQ_ASIS: - NODE_BODY(pnode) = cnode; - return ; break; } + return 0; + remove_cnode: NODE_BODY(cnode) = NULL_NODE; onig_node_free(cnode); + return 0; } static int @@ -4018,7 +4003,7 @@ node_new_general_newline(Node** node, ScanEnv* env) alen = ONIGENC_CODE_TO_MBC(env->enc, 0x0a, buf + dlen); if (alen < 0) return alen; - crnl = node_new_str_raw(buf, buf + dlen + alen); + crnl = node_new_str_crude(buf, buf + dlen + alen); CHECK_NULL_RETURN_MEMERR(crnl); ncc = node_new_cclass(); @@ -4046,7 +4031,7 @@ node_new_general_newline(Node** node, ScanEnv* env) if (r != 0) goto err1; } - x = node_new_bag_if_else(crnl, 0, ncc); + x = node_new_bag_if_else(crnl, NULL_NODE, ncc); if (IS_NULL(x)) goto err1; *node = x; @@ -4055,7 +4040,7 @@ node_new_general_newline(Node** node, ScanEnv* env) enum TokenSyms { TK_EOT = 0, /* end of token */ - TK_RAW_BYTE = 1, + TK_CRUDE_BYTE = 1, TK_CHAR, TK_STRING, TK_CODE_POINT, @@ -4070,7 +4055,7 @@ enum TokenSyms { TK_ALT, TK_SUBEXP_OPEN, TK_SUBEXP_CLOSE, - TK_CC_OPEN, + TK_OPEN_CC, TK_QUOTE_OPEN, TK_CHAR_PROPERTY, /* \p{...}, \P{...} */ TK_KEEP, /* \K */ @@ -4082,9 +4067,9 @@ enum TokenSyms { /* in cc */ TK_CC_CLOSE, TK_CC_RANGE, - TK_POSIX_BRACKET_OPEN, - TK_CC_AND, /* && */ - TK_CC_CC_OPEN /* [ */ + TK_CC_POSIX_BRACKET_OPEN, + TK_CC_AND, /* && */ + TK_CC_OPEN_CC /* [ */ }; typedef struct { @@ -4094,7 +4079,7 @@ typedef struct { UChar* backp; union { UChar* s; - int c; + UChar byte; OnigCodePoint code; int anchor; int subtype; @@ -4129,7 +4114,7 @@ typedef struct { static int -fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) +fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env) { int low, up, syn_allow, non_low = 0; int r = 0; @@ -4154,7 +4139,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) } } - low = onig_scan_unsigned_number(&p, end, env->enc); + low = scan_number(&p, end, env->enc); if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; if (low > ONIG_MAX_REPEAT_NUM) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; @@ -4173,7 +4158,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) PFETCH(c); if (c == ',') { UChar* prev = p; - up = onig_scan_unsigned_number(&p, end, env->enc); + up = scan_number(&p, end, env->enc); if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; if (up > ONIG_MAX_REPEAT_NUM) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; @@ -4196,7 +4181,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) if (PEND) goto invalid; PFETCH(c); if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { - if (c != MC_ESC(env->syntax)) goto invalid; + if (c != MC_ESC(env->syntax) || PEND) goto invalid; PFETCH(c); } if (c != '}') goto invalid; @@ -4419,7 +4404,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, PFETCH(c); if (! IS_CODE_DIGIT_ASCII(enc, c)) goto err; PUNFETCH; - level = onig_scan_unsigned_number(&p, end, enc); + level = scan_number(&p, end, enc); if (level < 0) return ONIGERR_TOO_BIG_NUMBER; *rlevel = (level * flag); exist_level = 1; @@ -4440,7 +4425,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, end: if (r == 0) { if (*num_type != IS_NOT_NUM) { - *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + *rback_num = scan_number(&pnum_head, name_end, enc); if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; else if (*rback_num == 0) { if (*num_type == IS_REL_NUM) @@ -4468,7 +4453,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, static int fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int* rback_num, - enum REF_NUM* num_type, int ref) + enum REF_NUM* num_type, int is_ref) { int r, sign; int digit_count; @@ -4498,7 +4483,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, return ONIGERR_EMPTY_GROUP_NAME; if (IS_CODE_DIGIT_ASCII(enc, c)) { - if (ref == 1) + if (is_ref == TRUE) *num_type = IS_ABS_NUM; else { r = ONIGERR_INVALID_GROUP_NAME; @@ -4506,7 +4491,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, digit_count++; } else if (c == '-') { - if (ref == 1) { + if (is_ref == TRUE) { *num_type = IS_REL_NUM; sign = -1; pnum_head = p; @@ -4516,7 +4501,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } } else if (c == '+') { - if (ref == 1) { + if (is_ref == TRUE) { *num_type = IS_REL_NUM; sign = 1; pnum_head = p; @@ -4566,7 +4551,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } if (*num_type != IS_NOT_NUM) { - *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + *rback_num = scan_number(&pnum_head, name_end, enc); if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; else if (*rback_num == 0) { if (*num_type == IS_REL_NUM) { @@ -4698,7 +4683,8 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, static int fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int num; + int r; + OnigCodePoint code; OnigCodePoint c, c2; OnigSyntaxType* syn = env->syntax; OnigEncoding enc = env->enc; @@ -4714,7 +4700,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PFETCH(c); tok->type = TK_CHAR; tok->base = 0; - tok->u.c = c; + tok->u.code = c; tok->escaped = 0; if (c == ']') { @@ -4731,7 +4717,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PFETCH(c); tok->escaped = 1; - tok->u.c = c; + tok->u.code = c; switch (c) { case 'w': tok->type = TK_CHAR_TYPE; @@ -4804,8 +4790,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { PINC; - num = scan_unsigned_octal_number(&p, end, 11, enc); - if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + r = scan_octal_number(&p, end, 0, 11, enc, &code); + if (r < 0) return r; if (!PEND) { c2 = PPEEK; if (IS_CODE_DIGIT_ASCII(enc, c2)) @@ -4816,7 +4802,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; tok->type = TK_CODE_POINT; tok->base = 8; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -4831,13 +4817,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); - if (num < 0) { - if (num == ONIGERR_TOO_BIG_NUMBER) - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - else - return num; - } + r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); + if (r < 0) return r; if (!PEND) { c2 = PPEEK; if (IS_CODE_XDIGIT_ASCII(enc, c2)) @@ -4848,7 +4829,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -4856,14 +4837,14 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 16; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; @@ -4872,14 +4853,14 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } break; @@ -4888,22 +4869,23 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { PUNFETCH; prev = p; - num = scan_unsigned_octal_number(&p, end, 3, enc); - if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER; + r = scan_octal_number(&p, end, 0, 3, enc, &code); + if (r < 0) return r; + if (code >= 256) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 8; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; default: PUNFETCH; - num = fetch_escaped_value(&p, end, env, &c2); - if (num < 0) return num; - if (tok->u.c != c2) { + r = fetch_escaped_value(&p, end, env, &c2); + if (r < 0) return r; + if (tok->u.code != c2) { tok->u.code = c2; tok->type = TK_CODE_POINT; } @@ -4917,7 +4899,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; if (str_exist_check_with_esc(send, 2, p, end, (OnigCodePoint )']', enc, syn)) { - tok->type = TK_POSIX_BRACKET_OPEN; + tok->type = TK_CC_POSIX_BRACKET_OPEN; } else { PUNFETCH; @@ -4927,7 +4909,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) else { cc_in_cc: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) { - tok->type = TK_CC_CC_OPEN; + tok->type = TK_CC_OPEN_CC; } else { CC_ESC_WARN(env, (UChar* )"["); @@ -4950,7 +4932,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int r, num; + int r; + OnigCodePoint code; OnigCodePoint c; OnigEncoding enc = env->enc; OnigSyntaxType* syn = env->syntax; @@ -4975,7 +4958,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->backp = p; PFETCH(c); - tok->u.c = c; + tok->u.code = c; tok->escaped = 1; switch (c) { case '*': @@ -5026,7 +5009,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; - r = fetch_interval_quantifier(&p, end, tok, env); + r = fetch_interval(&p, end, tok, env); if (r < 0) return r; /* error */ if (r == 0) goto greedy_check2; else if (r == 2) { /* {n} */ @@ -5214,8 +5197,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { PINC; - num = scan_unsigned_octal_number(&p, end, 11, enc); - if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + r = scan_octal_number(&p, end, 0, 11, enc, &code); + if (r < 0) return r; if (!PEND) { if (IS_CODE_DIGIT_ASCII(enc, PPEEK)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; @@ -5224,7 +5207,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -5239,13 +5222,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); - if (num < 0) { - if (num == ONIGERR_TOO_BIG_NUMBER) - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - else - return num; - } + r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); + if (r < 0) return r; if (!PEND) { if (IS_CODE_XDIGIT_ASCII(enc, PPEEK)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; @@ -5254,7 +5232,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -5262,14 +5240,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 16; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; @@ -5278,14 +5256,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } break; @@ -5293,21 +5271,21 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '5': case '6': case '7': case '8': case '9': PUNFETCH; prev = p; - num = onig_scan_unsigned_number(&p, end, enc); - if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { + r = scan_number(&p, end, enc); + if (r < 0 || r > ONIG_MAX_BACKREF_NUM) { goto skip_backref; } if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && - (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ + (r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */ if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { - if (num > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[num].node)) + if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; tok->u.backref.num = 1; - tok->u.backref.ref1 = num; + tok->u.backref.ref1 = r; tok->u.backref.by_name = 0; #ifdef USE_BACKREF_WITH_LEVEL tok->u.backref.exist_level = 0; @@ -5327,14 +5305,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '0': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { prev = p; - num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); - if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER; + r = scan_octal_number(&p, end, 0, (c == '0' ? 2:3), enc, &code); + if (r < 0 || r >= 256) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 8; - tok->u.c = num; + tok->u.byte = (UChar )code; } else if (c != '0') { PINC; @@ -5359,7 +5337,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (r == 1) tok->u.backref.exist_level = 1; else tok->u.backref.exist_level = 0; #else - r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, 1); + r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) return r; @@ -5372,7 +5350,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].node)) + IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; @@ -5381,7 +5359,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.ref1 = back_num; } else { - num = name_to_group_numbers(env, prev, name_end, &backs); + int num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { return ONIGERR_UNDEFINED_NAME_REFERENCE; } @@ -5389,7 +5367,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].node)) + IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } @@ -5422,7 +5400,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, - &gnum, &num_type, 1); + &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type != IS_NOT_NUM) { @@ -5483,10 +5461,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) OnigCodePoint c2; PUNFETCH; - num = fetch_escaped_value(&p, end, env, &c2); - if (num < 0) return num; - /* set_raw: */ - if (tok->u.c != c2) { + r = fetch_escaped_value(&p, end, env, &c2); + if (r < 0) return r; + if (tok->u.code != c2) { tok->type = TK_CODE_POINT; tok->u.code = c2; } @@ -5498,7 +5475,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else { - tok->u.c = c; + tok->u.code = c; tok->escaped = 0; #ifdef USE_VARIABLE_META_CHARS @@ -5563,7 +5540,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; - r = fetch_interval_quantifier(&p, end, tok, env); + r = fetch_interval(&p, end, tok, env); if (r < 0) return r; /* error */ if (r == 0) goto greedy_check2; else if (r == 2) { /* {n} */ @@ -5611,8 +5588,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { PINC; name = p; - r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, - &num_type, 0); + r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, + &gnum, &num_type, FALSE); if (r < 0) return r; tok->type = TK_CALL; @@ -5644,7 +5621,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { name = p; r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, - &gnum, &num_type, 1); + &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type == IS_NOT_NUM) { @@ -5700,7 +5677,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '[': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; - tok->type = TK_CC_OPEN; + tok->type = TK_OPEN_CC; break; case ']': @@ -5911,6 +5888,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) int c, r; int ascii_mode; + int is_single; const OnigCodePoint *ranges; OnigCodePoint limit; OnigCodePoint sb_out; @@ -5932,6 +5910,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } r = 0; + is_single = ONIGENC_IS_SINGLEBYTE(enc); limit = ascii_mode ? ASCII_LIMIT : SINGLE_BYTE_SIZE; switch (ctype) { @@ -5948,19 +5927,25 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) case ONIGENC_CTYPE_ALNUM: if (not != 0) { for (c = 0; c < (int )limit; c++) { - if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) { + if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } } for (c = limit; c < SINGLE_BYTE_SIZE; c++) { - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) + BITSET_SET_BIT(cc->bs, c); } - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + if (is_single == 0) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) { + if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } } } break; @@ -5970,21 +5955,25 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) case ONIGENC_CTYPE_WORD: if (not != 0) { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_CODE_TO_MBCLEN(enc, c) > 0 /* check invalid code point */ + /* check invalid code point */ + if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) && ! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) BITSET_SET_BIT(cc->bs, c); } for (c = limit; c < SINGLE_BYTE_SIZE; c++) { - if (ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) BITSET_SET_BIT(cc->bs, c); } + if (ascii_mode != 0 && is_single == 0) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) + && ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) BITSET_SET_BIT(cc->bs, c); } - if (ascii_mode == 0) + if (ascii_mode == 0 && is_single == 0) ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } break; @@ -6076,10 +6065,12 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) { int r; OnigCodePoint c; - OnigEncoding enc = env->enc; - UChar *prev, *start, *p = *src; + OnigEncoding enc; + UChar *prev, *start, *p; - r = 0; + p = *src; + enc = env->enc; + r = ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; start = prev = p; while (!PEND) { @@ -6087,18 +6078,20 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) PFETCH_S(c); if (c == '}') { r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); - if (r < 0) break; + if (r >= 0) { + *src = p; + } + else { + onig_scan_env_set_error_string(env, r, *src, prev); + } - *src = p; return r; } else if (c == '(' || c == ')' || c == '{' || c == '|') { - r = ONIGERR_INVALID_CHAR_PROPERTY_NAME; break; } } - onig_scan_env_set_error_string(env, r, *src, prev); return r; } @@ -6114,7 +6107,7 @@ parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* en *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - r = add_ctype_to_cc(cc, ctype, 0, env); + r = add_ctype_to_cc(cc, ctype, FALSE, env); if (r != 0) return r; if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); @@ -6122,67 +6115,67 @@ parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* en } -enum CCSTATE { - CCS_VALUE, - CCS_RANGE, - CCS_COMPLETE, - CCS_START -}; +typedef enum { + CS_VALUE, + CS_RANGE, + CS_COMPLETE, + CS_START +} CSTATE; -enum CCVALTYPE { - CCV_SB, - CCV_CODE_POINT, - CCV_CLASS -}; +typedef enum { + CV_UNDEF, + CV_SB, + CV_MB, + CV_CPROP +} CVAL; static int -next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type, - enum CCSTATE* state, ScanEnv* env) +cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, + ScanEnv* env) { int r; - if (*state == CCS_RANGE) + if (*state == CS_RANGE) return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; - if (*state == CCS_VALUE && *type != CCV_CLASS) { - if (*type == CCV_SB) - BITSET_SET_BIT(cc->bs, (int )(*vs)); - else if (*type == CCV_CODE_POINT) { - r = add_code_range(&(cc->mbuf), env, *vs, *vs); + if (*state == CS_VALUE) { + if (*val == CV_SB) + BITSET_SET_BIT(cc->bs, (int )(*pcode)); + else if (*val == CV_MB) { + r = add_code_range(&(cc->mbuf), env, *pcode, *pcode); if (r < 0) return r; } } - *state = CCS_VALUE; - *type = CCV_CLASS; + *state = CS_VALUE; + *val = CV_CPROP; return 0; } static int -next_state_val(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, - int* from_israw, int to_israw, - enum CCVALTYPE intype, enum CCVALTYPE* type, - enum CCSTATE* state, ScanEnv* env) +cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, + int* from_raw, int to_raw, CVAL intype, CVAL* type, + CSTATE* state, ScanEnv* env) { int r; switch (*state) { - case CCS_VALUE: - if (*type == CCV_SB) { + case CS_VALUE: + if (*type == CV_SB) { if (*from > 0xff) return ONIGERR_INVALID_CODE_POINT_VALUE; BITSET_SET_BIT(cc->bs, (int )(*from)); } - else if (*type == CCV_CODE_POINT) { + else if (*type == CV_MB) { r = add_code_range(&(cc->mbuf), env, *from, *from); if (r < 0) return r; } break; - case CCS_RANGE: + case CS_RANGE: if (intype == *type) { - if (intype == CCV_SB) { + if (intype == CV_SB) { if (*from > 0xff || to > 0xff) return ONIGERR_INVALID_CODE_POINT_VALUE; @@ -6211,21 +6204,21 @@ next_state_val(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, if (r < 0) return r; } ccs_range_end: - *state = CCS_COMPLETE; + *state = CS_COMPLETE; break; - case CCS_COMPLETE: - case CCS_START: - *state = CCS_VALUE; + case CS_COMPLETE: + case CS_START: + *state = CS_VALUE; break; default: break; } - *from_israw = to_israw; - *from = to; - *type = intype; + *from_raw = to_raw; + *from = to; + *type = intype; return 0; } @@ -6253,27 +6246,25 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, } static int -parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) { int r, neg, len, fetched, and_start; - OnigCodePoint v, vs; + OnigCodePoint in_code, curr_code; UChar *p; Node* node; CClassNode *cc, *prev_cc; CClassNode work_cc; - - enum CCSTATE state; - enum CCVALTYPE val_type, in_type; - int val_israw, in_israw; + int curr_raw, in_raw; + CSTATE state; + CVAL in_type; + CVAL curr_type; *np = NULL_NODE; - env->parse_depth++; - if (env->parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(env->parse_depth); prev_cc = (CClassNode* )NULL; r = fetch_token_in_cc(tok, src, end, env); - if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { + if (r == TK_CHAR && tok->u.code == (OnigCodePoint )'^' && tok->escaped == 0) { neg = 1; r = fetch_token_in_cc(tok, src, end, env); } @@ -6296,31 +6287,27 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) cc = CCLASS_(node); and_start = 0; - state = CCS_START; + state = CS_START; + curr_type = CV_UNDEF; + p = *src; while (r != TK_CC_CLOSE) { fetched = 0; switch (r) { case TK_CHAR: any_char_in: - len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); - if (len > 1) { - in_type = CCV_CODE_POINT; - } - else if (len < 0) { + len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code); + if (len < 0) { r = len; goto err; } - else { - /* sb_char: */ - in_type = CCV_SB; - } - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_type = (len == 1) ? CV_SB : CV_MB; + in_code = tok->u.code; + in_raw = 0; goto val_entry2; break; - case TK_RAW_BYTE: + case TK_CRUDE_BYTE: /* tok->base != 0 : octal or hexadec. */ if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { int i, j; @@ -6329,15 +6316,15 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) UChar* psave = p; int base = tok->base; - buf[0] = tok->u.c; + buf[0] = tok->u.byte; for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; - if (r != TK_RAW_BYTE || tok->base != base) { + if (r != TK_CRUDE_BYTE || tok->base != base) { fetched = 1; break; } - buf[i] = tok->u.c; + buf[i] = tok->u.byte; } if (i < ONIGENC_MBC_MINLEN(env->enc)) { @@ -6362,63 +6349,63 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } if (i == 1) { - v = (OnigCodePoint )buf[0]; - goto raw_single; + in_code = (OnigCodePoint )buf[0]; + goto crude_single; } else { - v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); - in_type = CCV_CODE_POINT; + in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); + in_type = CV_MB; } } else { - v = (OnigCodePoint )tok->u.c; - raw_single: - in_type = CCV_SB; + in_code = (OnigCodePoint )tok->u.byte; + crude_single: + in_type = CV_SB; } - in_israw = 1; + in_raw = 1; goto val_entry2; break; case TK_CODE_POINT: - v = tok->u.code; - in_israw = 1; + in_code = tok->u.code; + in_raw = 1; val_entry: - len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); + len = ONIGENC_CODE_TO_MBCLEN(env->enc, in_code); if (len < 0) { - if (state != CCS_RANGE || + if (state != CS_RANGE || ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || - v < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { + in_code < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { r = len; goto err; } } - in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT); + in_type = (len == 1 ? CV_SB : CV_MB); val_entry2: - r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type, - &state, env); + r = cc_char_next(cc, &curr_code, in_code, &curr_raw, in_raw, in_type, + &curr_type, &state, env); if (r != 0) goto err; break; - case TK_POSIX_BRACKET_OPEN: + case TK_CC_POSIX_BRACKET_OPEN: r = parse_posix_bracket(cc, &p, end, env); if (r < 0) goto err; if (r == 1) { /* is not POSIX bracket */ CC_ESC_WARN(env, (UChar* )"["); p = tok->backp; - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_code = tok->u.code; + in_raw = 0; goto val_entry; } - goto next_class; + goto next_cprop; break; case TK_CHAR_TYPE: r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env); if (r != 0) goto err; - next_class: - r = next_state_class(cc, &vs, &val_type, &state, env); + next_cprop: + r = cc_cprop_next(cc, &curr_code, &curr_type, &state, env); if (r != 0) goto err; break; @@ -6431,19 +6418,20 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env); if (r != 0) goto err; - goto next_class; + goto next_cprop; } break; case TK_CC_RANGE: - if (state == CCS_VALUE) { + if (state == CS_VALUE) { r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; if (r == TK_CC_CLOSE) { /* allow [x-] */ range_end_val: - v = (OnigCodePoint )'-'; - in_israw = 0; + in_code = (OnigCodePoint )'-'; + in_raw = 0; goto val_entry; } else if (r == TK_CC_AND) { @@ -6451,20 +6439,21 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto range_end_val; } - if (val_type == CCV_CLASS) { + if (curr_type == CV_CPROP) { r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; goto err; } - state = CCS_RANGE; + state = CS_RANGE; } - else if (state == CCS_START) { + else if (state == CS_START) { /* [-xa] is allowed */ - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_code = tok->u.code; + in_raw = 0; r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; /* [--x] or [a&&-x] is warned. */ if (r == TK_CC_RANGE || and_start != 0) @@ -6472,15 +6461,17 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto val_entry; } - else if (state == CCS_RANGE) { + else if (state == CS_RANGE) { CC_ESC_WARN(env, (UChar* )"-"); - goto any_char_in; /* [!--x] is allowed */ + goto any_char_in; /* [!--] is allowed */ } - else { /* CCS_COMPLETE */ + else { /* CS_COMPLETE */ r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; - if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ + if (r == TK_CC_CLOSE) + goto range_end_val; /* allow [a-b-] */ else if (r == TK_CC_AND) { CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; @@ -6495,12 +6486,19 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } break; - case TK_CC_CC_OPEN: /* [ */ + case TK_CC_OPEN_CC: /* [ */ { Node *anode; CClassNode* acc; - r = parse_char_class(&anode, tok, &p, end, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); + if (r != 0) goto err; + } + state = CS_COMPLETE; + + r = parse_cc(&anode, tok, &p, end, env); if (r != 0) { onig_node_free(anode); goto cc_open_err; @@ -6516,14 +6514,14 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) case TK_CC_AND: /* && */ { - if (state == CCS_VALUE) { - r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, - &val_type, &state, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); if (r != 0) goto err; } /* initialize local variables */ and_start = 1; - state = CCS_START; + state = CS_START; if (IS_NOT_NULL(prev_cc)) { r = and_cclass(prev_cc, cc, env->enc); @@ -6556,9 +6554,9 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } - if (state == CCS_VALUE) { - r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, - &val_type, &state, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); if (r != 0) goto err; } @@ -6591,7 +6589,7 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } *src = p; - env->parse_depth--; + DEC_PARSE_DEPTH(env->parse_depth); return 0; err: @@ -6600,8 +6598,8 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) return r; } -static int parse_subexp(Node** top, PToken* tok, int term, - UChar** src, UChar* end, ScanEnv* env, int group_head); +static int parse_alts(Node** top, PToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env, int group_head); #ifdef USE_CALLOUT @@ -6772,7 +6770,8 @@ parse_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* static int parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, - unsigned int types[], OnigValue vals[], ScanEnv* env) + int max_arg_num, unsigned int types[], OnigValue vals[], + ScanEnv* env) { #define MAX_CALLOUT_ARG_BYTE_LENGTH 128 @@ -6791,9 +6790,9 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; + c = 0; n = 0; while (n < ONIG_CALLOUT_MAX_ARGS_NUM) { - c = 0; cn = 0; esc = 0; eesc = 0; @@ -6826,7 +6825,7 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, size_t clen; add_char: - if (skip_mode == 0) { + if (skip_mode == FALSE) { clen = p - e; if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) return ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */ @@ -6840,7 +6839,10 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, } if (cn != 0) { - if (skip_mode == 0) { + if (max_arg_num >= 0 && n >= max_arg_num) + return ONIGERR_INVALID_CALLOUT_ARG; + + if (skip_mode == FALSE) { if ((types[n] & ONIG_TYPE_LONG) != 0) { int fixed = 0; if (cn > 0) { @@ -6972,7 +6974,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en /* read for single check only */ save = p; - arg_num = parse_callout_args(1, '}', &p, end, 0, 0, env); + arg_num = parse_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env); if (arg_num < 0) return arg_num; is_not_single = PPEEK_IS(cterm) ? 0 : 1; @@ -6986,7 +6988,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en types[i] = get_callout_arg_type_by_name_id(name_id, i); } - arg_num = parse_callout_args(0, '}', &p, end, types, vals, env); + arg_num = parse_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env); if (arg_num < 0) return arg_num; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; @@ -7086,17 +7088,17 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, group: r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(np, tok, term, &p, end, env, 0); + r = parse_alts(np, tok, term, &p, end, env, FALSE); if (r < 0) return r; *src = p; return 1; /* group */ break; case '=': - *np = onig_node_new_anchor(ANCR_PREC_READ, 0); + *np = onig_node_new_anchor(ANCR_PREC_READ, FALSE); break; case '!': /* preceding read */ - *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, 0); + *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, FALSE); break; case '>': /* (?>...) stop backtrack */ *np = node_new_bag(BAG_STOP_BACKTRACK); @@ -7114,9 +7116,9 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; PFETCH(c); if (c == '=') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, 0); + *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, FALSE); else if (c == '!') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, 0); + *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, FALSE); else { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { UChar *name; @@ -7132,7 +7134,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, named_group2: name = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, - &num_type, 0); + &num_type, FALSE); if (r < 0) return r; num = scan_env_add_mem_entry(env); @@ -7146,7 +7148,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, CHECK_NULL_RETURN_MEMERR(*np); BAG_(*np)->m.regnum = num; if (list_capture != 0) - MEM_STATUS_ON_SIMPLE(env->capture_history, num); + MEM_STATUS_ON_SIMPLE(env->cap_history, num); env->num_named++; } else { @@ -7181,7 +7183,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&absent, tok, term, &p, end, env, 1); + r = parse_alts(&absent, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(absent); return r; @@ -7268,7 +7270,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (r == 1) exist_level = 1; #else r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? c : '('), - &p, end, &name_end, env, &back_num, &num_type, 1); + &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) { if (is_enclosed == 0) { @@ -7288,11 +7290,11 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].node)) + IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } - condition = node_new_backref_checker(1, &back_num, 0, + condition = node_new_backref_checker(1, &back_num, FALSE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif @@ -7310,12 +7312,12 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].node)) + IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } - condition = node_new_backref_checker(num, backs, 1, + condition = node_new_backref_checker(num, backs, TRUE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif @@ -7357,7 +7359,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, condition_is_checker = 0; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&condition, tok, term, &p, end, env, 0); + r = parse_alts(&condition, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(condition); return r; @@ -7400,7 +7402,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, onig_node_free(condition); return r; } - r = parse_subexp(&target, tok, term, &p, end, env, 1); + r = parse_alts(&target, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(condition); onig_node_free(target); @@ -7465,7 +7467,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; } BAG_(*np)->m.regnum = num; - MEM_STATUS_ON_SIMPLE(env->capture_history, num); + MEM_STATUS_ON_SIMPLE(env->cap_history, num); } else { return ONIGERR_UNDEFINED_GROUP_OPTION; @@ -7501,7 +7503,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, case 'm': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { - OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0)); + OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? TRUE : FALSE)); } else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) { @@ -7537,16 +7539,16 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 0); - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 1); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE); break; #ifdef USE_UNICODE_WORD_BREAK case 'w': if (! ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 0); - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 1); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE); break; #endif default: @@ -7576,7 +7578,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, &p, end, env, 0); + r = parse_alts(&target, tok, term, &p, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7623,7 +7625,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, CHECK_NULL_RETURN_MEMERR(*np); r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, &p, end, env, 0); + r = parse_alts(&target, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(target); return r; @@ -7633,7 +7635,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (NODE_TYPE(*np) == NODE_BAG) { if (BAG_(*np)->type == BAG_MEMORY) { - /* Don't move this to previous of parse_subexp() */ + /* Don't move this to previous of parse_alts() */ r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np); if (r != 0) return r; } @@ -7653,7 +7655,7 @@ static const char* ReduceQStr[] = { }; static int -set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) +assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env) { QuantNode* qn; @@ -7725,9 +7727,11 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) } } else { + int r; + NODE_BODY(qnode) = target; - onig_reduce_nested_quantifier(qnode, target); - goto q_exit; + r = onig_reduce_nested_quantifier(qnode); + return r; } } break; @@ -7737,7 +7741,6 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) } NODE_BODY(qnode) = target; - q_exit: return 0; } @@ -7767,6 +7770,38 @@ clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) } #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ +#define ADD_CODE_INTO_CC(cc, code, enc) do {\ + if (ONIGENC_MBC_MINLEN(enc) > 1 || ONIGENC_CODE_TO_MBCLEN(enc, code) != 1) {\ + add_code_range_to_buf(&((cc)->mbuf), code, code);\ + }\ + else {\ + BITSET_SET_BIT((cc)->bs, code);\ + }\ +} while (0) + +extern int +onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, + int n, OnigCodePoint codes[]) +{ + int i; + Node* node; + CClassNode* cc; + + *rnode = NULL_NODE; + + node = node_new_cclass(); + CHECK_NULL_RETURN_MEMERR(node); + + cc = CCLASS_(node); + + for (i = 0; i < n; i++) { + ADD_CODE_INTO_CC(cc, codes[i], enc); + } + + *rnode = node; + return 0; +} + typedef struct { ScanEnv* env; CClassNode* cc; @@ -7780,37 +7815,31 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) IApplyCaseFoldArg* iarg; ScanEnv* env; CClassNode* cc; - BitSetRef bs; iarg = (IApplyCaseFoldArg* )arg; env = iarg->env; cc = iarg->cc; - bs = cc->bs; if (to_len == 1) { int is_in = onig_is_code_in_cc(env->enc, from, cc); #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || (is_in == 0 && IS_NCCLASS_NOT(cc))) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { - add_code_range(&(cc->mbuf), env, *to, *to); - } - else { - BITSET_SET_BIT(bs, *to); - } + ADD_CODE_INTO_CC(cc, *to, env->enc); } #else if (is_in != 0) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { + if (ONIGENC_MBC_MINLEN(env->enc) > 1 || + ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) { if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); add_code_range(&(cc->mbuf), env, *to, *to); } else { if (IS_NCCLASS_NOT(cc)) { - BITSET_CLEAR_BIT(bs, *to); + BITSET_CLEAR_BIT(cc->bs, *to); } else - BITSET_SET_BIT(bs, *to); + BITSET_SET_BIT(cc->bs, *to); } } #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ @@ -7818,34 +7847,65 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) else { int r, i, len; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - Node *snode = NULL_NODE; if (onig_is_code_in_cc(env->enc, from, cc) #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS && !IS_NCCLASS_NOT(cc) #endif ) { + int n, j, m, index; + Node* list_node; + Node* ns[3]; + + n = 0; for (i = 0; i < to_len; i++) { - len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); - if (i == 0) { - snode = onig_node_new_str(buf, buf + len); - CHECK_NULL_RETURN_MEMERR(snode); - - /* char-class expanded multi-char only - compare with string folded at match time. */ - NODE_STRING_SET_AMBIG(snode); + OnigCodePoint code; + Node* csnode; + CClassNode* cs_cc; + + index = onigenc_unicode_fold1_key(&to[i]); + if (index >= 0) { + csnode = node_new_cclass(); + cs_cc = CCLASS_(csnode); + if (IS_NULL(csnode)) { + err_free_ns: + for (j = 0; j < n; j++) onig_node_free(ns[j]); + return ONIGERR_MEMORY; + } + m = FOLDS1_UNFOLDS_NUM(index); + for (j = 0; j < m; j++) { + code = FOLDS1_UNFOLDS(index)[j]; + ADD_CODE_INTO_CC(cs_cc, code, env->enc); + } + ADD_CODE_INTO_CC(cs_cc, to[i], env->enc); + ns[n++] = csnode; } else { - r = onig_node_str_cat(snode, buf, buf + len); - if (r < 0) { - onig_node_free(snode); - return r; + len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); + if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) { + csnode = onig_node_new_str(buf, buf + len); + if (IS_NULL(csnode)) goto err_free_ns; + + NODE_STRING_SET_CASE_EXPANDED(csnode); + ns[n++] = csnode; + } + else { + r = onig_node_str_cat(ns[n-1], buf, buf + len); + if (r < 0) goto err_free_ns; } } } - *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_MEMERR(*(iarg->ptail)); + if (n == 1) + list_node = ns[0]; + else + list_node = make_list(n, ns); + + *(iarg->ptail) = onig_node_new_alt(list_node, NULL_NODE); + if (IS_NULL(*(iarg->ptail))) { + onig_node_free(list_node); + return ONIGERR_MEMORY; + } iarg->ptail = &(NODE_CDR((*(iarg->ptail)))); } } @@ -7901,7 +7961,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = BAG_(*np)->o.options; r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, src, end, env, 0); + r = parse_alts(&target, tok, term, src, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7916,7 +7976,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; - if (tok->escaped) goto tk_raw_byte; + if (tok->escaped) goto tk_crude_byte; else goto tk_byte; break; @@ -7941,36 +8001,36 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, } break; - case TK_RAW_BYTE: - tk_raw_byte: + case TK_CRUDE_BYTE: + tk_crude_byte: { - *np = node_new_str_raw_char((UChar )tok->u.c); + *np = node_new_str_crude_char(tok->u.byte); CHECK_NULL_RETURN_MEMERR(*np); len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { if (len == enclen(env->enc, STR_(*np)->s)) { r = fetch_token(tok, src, end, env); - goto tk_raw_byte_end; + goto tk_crude_byte_end; } } r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_RAW_BYTE) + if (r != TK_CRUDE_BYTE) return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - r = node_str_cat_char(*np, (UChar )tok->u.c); + r = node_str_cat_char(*np, tok->u.byte); if (r < 0) return r; len++; } - tk_raw_byte_end: + tk_crude_byte_end: if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) return ONIGERR_INVALID_WIDE_CHAR_VALUE; - NODE_STRING_CLEAR_RAW(*np); + NODE_STRING_CLEAR_CRUDE(*np); goto string_end; } break; @@ -7981,7 +8041,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); if (len < 0) return len; #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - *np = node_new_str_raw(buf, buf + len); + *np = node_new_str_crude(buf, buf + len); #else *np = node_new_str(buf, buf + len); #endif @@ -8024,7 +8084,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); + add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env); if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); } break; @@ -8041,11 +8101,11 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (r != 0) return r; break; - case TK_CC_OPEN: + case TK_OPEN_CC: { CClassNode* cc; - r = parse_char_class(np, tok, src, end, env); + r = parse_cc(np, tok, src, end, env); if (r != 0) return r; cc = CCLASS_(*np); @@ -8083,7 +8143,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_ANYCHAR_ANYTIME: *np = node_new_anychar(); CHECK_NULL_RETURN_MEMERR(*np); - qn = node_new_quantifier(0, INFINITE_REPEAT, 0); + qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE); CHECK_NULL_RETURN_MEMERR(qn); NODE_BODY(qn) = *np; *np = qn; @@ -8186,9 +8246,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (is_invalid_quantifier_target(*tp)) return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; - parse_depth++; - if (parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(parse_depth); qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, r == TK_INTERVAL); @@ -8201,9 +8259,10 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, else { target = *tp; } - r = set_quantifier(qn, target, group, env); + r = assign_quantifier_body(qn, target, group, env); if (r < 0) { onig_node_free(qn); + *tp = NULL_NODE; return r; } @@ -8256,6 +8315,8 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, Node *node, **headp; *top = NULL; + INC_PARSE_DEPTH(env->parse_depth); + r = parse_exp(&node, tok, term, src, end, env, group_head); if (r < 0) { onig_node_free(node); @@ -8266,7 +8327,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, *top = node; } else { - *top = node_new_list(node, NULL); + *top = node_new_list(node, NULL); if (IS_NULL(*top)) { onig_node_free(node); return ONIGERR_MEMORY; @@ -8274,7 +8335,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, headp = &(NODE_CDR(*top)); while (r != TK_EOT && r != term && r != TK_ALT) { - r = parse_exp(&node, tok, term, src, end, env, 0); + r = parse_exp(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; @@ -8292,21 +8353,20 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, } } + DEC_PARSE_DEPTH(env->parse_depth); return r; } /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ static int -parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) +parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env, int group_head) { int r; Node *node, **headp; *top = NULL; - env->parse_depth++; - if (env->parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(env->parse_depth); r = parse_branch(&node, tok, term, src, end, env, group_head); if (r < 0) { @@ -8328,7 +8388,7 @@ parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, while (r == TK_ALT) { r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_branch(&node, tok, term, src, end, env, 0); + r = parse_branch(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; @@ -8355,7 +8415,7 @@ parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_PARSER_BUG; } - env->parse_depth--; + DEC_PARSE_DEPTH(env->parse_depth); return r; } @@ -8367,7 +8427,7 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) r = fetch_token(&tok, src, end, env); if (r < 0) return r; - r = parse_subexp(top, &tok, TK_EOT, src, end, env, 0); + r = parse_alts(top, &tok, TK_EOT, src, end, env, FALSE); if (r < 0) return r; return 0; |