diff options
Diffstat (limited to 'src/regcomp.c')
-rw-r--r-- | src/regcomp.c | 253 |
1 files changed, 162 insertions, 91 deletions
diff --git a/src/regcomp.c b/src/regcomp.c index 47023cb..ab5701c 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -277,7 +277,7 @@ unset_addr_list_add(UnsetAddrList* list, int offset, struct _Node* node) static int add_opcode(regex_t* reg, int opcode) { - BBUF_ADD1(reg, opcode); + BB_ADD1(reg, opcode); return 0; } @@ -287,7 +287,7 @@ add_state_check_num(regex_t* reg, int num) { StateCheckNumType n = (StateCheckNumType )num; - BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM); + BB_ADD(reg, &n, SIZE_STATE_CHECK_NUM); return 0; } #endif @@ -297,7 +297,7 @@ add_rel_addr(regex_t* reg, int addr) { RelAddrType ra = (RelAddrType )addr; - BBUF_ADD(reg, &ra, SIZE_RELADDR); + BB_ADD(reg, &ra, SIZE_RELADDR); return 0; } @@ -306,7 +306,7 @@ add_abs_addr(regex_t* reg, int addr) { AbsAddrType ra = (AbsAddrType )addr; - BBUF_ADD(reg, &ra, SIZE_ABSADDR); + BB_ADD(reg, &ra, SIZE_ABSADDR); return 0; } @@ -315,7 +315,7 @@ add_length(regex_t* reg, int len) { LengthType l = (LengthType )len; - BBUF_ADD(reg, &l, SIZE_LENGTH); + BB_ADD(reg, &l, SIZE_LENGTH); return 0; } @@ -324,7 +324,7 @@ add_mem_num(regex_t* reg, int num) { MemNumType n = (MemNumType )num; - BBUF_ADD(reg, &n, SIZE_MEMNUM); + BB_ADD(reg, &n, SIZE_MEMNUM); return 0; } @@ -334,7 +334,7 @@ add_pointer(regex_t* reg, void* addr) { PointerType ptr = (PointerType )addr; - BBUF_ADD(reg, &ptr, SIZE_POINTER); + BB_ADD(reg, &ptr, SIZE_POINTER); return 0; } #endif @@ -342,7 +342,7 @@ add_pointer(regex_t* reg, void* addr) static int add_option(regex_t* reg, OnigOptionType option) { - BBUF_ADD(reg, &option, SIZE_OPTION); + BB_ADD(reg, &option, SIZE_OPTION); return 0; } @@ -351,7 +351,7 @@ add_save_type(regex_t* reg, enum SaveType type) { SaveType t = (SaveType )type; - BBUF_ADD(reg, &t, SIZE_SAVE_TYPE); + BB_ADD(reg, &t, SIZE_SAVE_TYPE); return 0; } @@ -360,7 +360,14 @@ add_update_var_type(regex_t* reg, enum UpdateVarType type) { UpdateVarType t = (UpdateVarType )type; - BBUF_ADD(reg, &t, SIZE_UPDATE_VAR_TYPE); + BB_ADD(reg, &t, SIZE_UPDATE_VAR_TYPE); + return 0; +} + +static int +add_mode(regex_t* reg, ModeType mode) +{ + BB_ADD(reg, &mode, SIZE_MODE); return 0; } @@ -378,14 +385,14 @@ add_opcode_rel_addr(regex_t* reg, int opcode, int addr) static int add_bytes(regex_t* reg, UChar* bytes, int len) { - BBUF_ADD(reg, bytes, len); + BB_ADD(reg, bytes, len); return 0; } static int add_bitset(regex_t* reg, BitSetRef bs) { - BBUF_ADD(reg, bs, SIZE_BITSET); + BB_ADD(reg, bs, SIZE_BITSET); return 0; } @@ -492,7 +499,7 @@ compile_call(CallNode* node, regex_t* reg, ScanEnv* env) r = add_opcode(reg, OP_CALL); if (r != 0) return r; - r = unset_addr_list_add(env->unset_addr_list, BBUF_GET_OFFSET_POS(reg), + r = unset_addr_list_add(env->unset_addr_list, BB_GET_OFFSET_POS(reg), NODE_CALL_BODY(node)); if (r != 0) return r; r = add_abs_addr(reg, 0 /*dummy addr.*/); @@ -655,7 +662,7 @@ add_multi_byte_cclass(BBuf* mbuf, regex_t* reg) return add_bytes(reg, mbuf->p, mbuf->used); #else int r, pad_size; - UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH; + UChar* p = BB_GET_ADD_ADDRESS(reg) + SIZE_LENGTH; GET_ALIGNMENT_PAD_SIZE(p, pad_size); add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1)); @@ -1400,7 +1407,7 @@ compile_enclosure_memory_node(EnclosureNode* node, regex_t* reg, ScanEnv* env) if (node->m.regnum == 0 && NODE_IS_CALLED(node)) { r = add_opcode(reg, OP_CALL); if (r != 0) return r; - node->m.called_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP; + node->m.called_addr = BB_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP; NODE_STATUS_ADD(node, NST_ADDR_FIXED); r = add_abs_addr(reg, (int )node->m.called_addr); if (r != 0) return r; @@ -1418,7 +1425,7 @@ compile_enclosure_memory_node(EnclosureNode* node, regex_t* reg, ScanEnv* env) if (NODE_IS_CALLED(node)) { r = add_opcode(reg, OP_CALL); if (r != 0) return r; - node->m.called_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP; + node->m.called_addr = BB_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP; NODE_STATUS_ADD(node, NST_ADDR_FIXED); r = add_abs_addr(reg, (int )node->m.called_addr); if (r != 0) return r; @@ -1588,6 +1595,20 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT; break; + case ANCHOR_WORD_BOUNDARY: + case ANCHOR_NO_WORD_BOUNDARY: +#ifdef USE_WORD_BEGIN_END + case ANCHOR_WORD_BEGIN: + case ANCHOR_WORD_END: +#endif + len = SIZE_OP_WORD_BOUNDARY; + break; + + case ANCHOR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY: + case ANCHOR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY: + len = SIZE_OPCODE; + break; + default: len = SIZE_OPCODE; break; @@ -1600,6 +1621,7 @@ static int compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) { int r, len; + enum OpCode op; switch (node->type) { case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break; @@ -1609,13 +1631,34 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break; case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break; - case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break; - case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break; + case ANCHOR_WORD_BOUNDARY: + op = OP_WORD_BOUNDARY; + word: + r = add_opcode(reg, op); + if (r != 0) return r; + r = add_mode(reg, (ModeType )node->ascii_mode); + break; + + case ANCHOR_NO_WORD_BOUNDARY: + op = OP_NO_WORD_BOUNDARY; goto word; + break; #ifdef USE_WORD_BEGIN_END - case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break; - case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break; + case ANCHOR_WORD_BEGIN: + op = OP_WORD_BEGIN; goto word; + break; + case ANCHOR_WORD_END: + op = OP_WORD_END; goto word; + break; #endif + case ANCHOR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY: + r = add_opcode(reg, OP_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY); + break; + + case ANCHOR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY: + r = add_opcode(reg, OP_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY); + break; + case ANCHOR_PREC_READ: r = add_opcode(reg, OP_PREC_READ_START); if (r != 0) return r; @@ -1914,9 +1957,12 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) break; case ONIGENC_CTYPE_WORD: - if (CTYPE_(node)->not != 0) op = OP_NOT_WORD; - else op = OP_WORD; - + if (CTYPE_(node)->ascii_mode == 0) { + op = CTYPE_(node)->not != 0 ? OP_NO_WORD : OP_WORD; + } + else { + op = CTYPE_(node)->not != 0 ? OP_NO_WORD_ASCII : OP_WORD_ASCII; + } r = add_opcode(reg, op); break; @@ -2038,8 +2084,6 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) return r; } -#ifdef USE_NAMED_GROUP - static int noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) { @@ -2283,7 +2327,6 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) return onig_renumber_name_table(reg, map); } -#endif /* USE_NAMED_GROUP */ #ifdef USE_CALL static int @@ -2301,7 +2344,7 @@ unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg) addr = en->m.called_addr; offset = uslist->us[i].offset; - BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR); + BB_WRITE(reg, offset, &addr, SIZE_ABSADDR); } return 0; } @@ -2394,9 +2437,6 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) #endif case NODE_CTYPE: - *len = 1; - break; - case NODE_CCLASS: *len = 1; break; @@ -2496,7 +2536,8 @@ is_exclusive(Node* x, Node* y, regex_t* reg) switch (ytype) { case NODE_CTYPE: if (CTYPE_(y)->ctype == CTYPE_(x)->ctype && - CTYPE_(y)->not != CTYPE_(x)->not) + CTYPE_(y)->not != CTYPE_(x)->not && + CTYPE_(y)->ascii_mode == CTYPE_(x)->ascii_mode) return 1; else return 0; @@ -2523,6 +2564,7 @@ is_exclusive(Node* x, Node* y, regex_t* reg) case NODE_CCLASS: { + int range; CClassNode* xc = CCLASS_(x); switch (ytype) { case NODE_CTYPE: @@ -2534,9 +2576,10 @@ is_exclusive(Node* x, Node* y, regex_t* reg) case ONIGENC_CTYPE_WORD: if (CTYPE_(y)->not == 0) { if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE; + for (i = 0; i < range; i++) { if (BITSET_AT(xc->bs, i)) { - if (IS_CODE_SB_WORD(reg->enc, i)) return 0; + if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0; } } return 1; @@ -2545,18 +2588,18 @@ is_exclusive(Node* x, Node* y, regex_t* reg) } else { if (IS_NOT_NULL(xc->mbuf)) return 0; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (! IS_CODE_SB_WORD(reg->enc, i)) { - if (!IS_NCCLASS_NOT(xc)) { - if (BITSET_AT(xc->bs, i)) - return 0; - } - else { - if (! BITSET_AT(xc->bs, i)) - return 0; - } + if (IS_NCCLASS_NOT(xc)) return 0; + + range = CTYPE_(y)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE; + for (i = 0; i < range; i++) { + if (! ONIGENC_IS_CODE_WORD(reg->enc, i)) { + if (BITSET_AT(xc->bs, i)) + return 0; } } + for (i = range; i < SINGLE_BYTE_SIZE; i++) { + if (BITSET_AT(xc->bs, i)) return 0; + } return 1; } break; @@ -2612,10 +2655,18 @@ is_exclusive(Node* x, Node* y, regex_t* reg) break; case ONIGENC_CTYPE_WORD: - if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end)) - return CTYPE_(y)->not; - else - return !(CTYPE_(y)->not); + if (CTYPE_(y)->ascii_mode == 0) { + if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end)) + return CTYPE_(y)->not; + else + return !(CTYPE_(y)->not); + } + else { + if (ONIGENC_IS_MBC_WORD_ASCII(reg->enc, xs->s, xs->end)) + return CTYPE_(y)->not; + else + return !(CTYPE_(y)->not); + } break; default: break; @@ -2780,7 +2831,7 @@ check_type_tree(Node* node, int type_mask, int enclosure_mask, int anchor_mask) case NODE_ENCLOSURE: { EnclosureNode* en = ENCLOSURE_(node); - if ((en->type & enclosure_mask) == 0) + if (((1<<en->type) & enclosure_mask) == 0) return 1; r = check_type_tree(NODE_BODY(node), type_mask, enclosure_mask, anchor_mask); @@ -3512,7 +3563,7 @@ divide_look_behind_alternatives(Node* node) np = node; while (IS_NOT_NULL(np = NODE_CDR(np))) { - insert_node = onig_node_new_anchor(anc_type); + insert_node = onig_node_new_anchor(anc_type, an->ascii_mode); CHECK_NULL_RETURN_MEMERR(insert_node); NODE_BODY(insert_node) = NODE_CAR(np); NODE_CAR(np) = insert_node; @@ -4150,22 +4201,19 @@ setup_call_node_call(CallNode* cn, ScanEnv* env, int state) if (cn->by_number != 0) { int gnum = cn->group_num; -#ifdef USE_NAMED_GROUP if (env->num_named > 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && !ONIG_IS_OPTION_ON(env->options, ONIG_OPTION_CAPTURE_GROUP)) { return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; } -#endif + if (gnum > env->num_mem) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end); return ONIGERR_UNDEFINED_GROUP_REFERENCE; } -#ifdef USE_NAMED_GROUP set_call_attr: -#endif NODE_CALL_BODY(cn) = mem_env[cn->group_num].node; if (IS_NULL(NODE_CALL_BODY(cn))) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, @@ -4173,7 +4221,6 @@ setup_call_node_call(CallNode* cn, ScanEnv* env, int state) return ONIGERR_UNDEFINED_NAME_REFERENCE; } } -#ifdef USE_NAMED_GROUP else { int *refs; @@ -4193,7 +4240,6 @@ setup_call_node_call(CallNode* cn, ScanEnv* env, int state) goto set_call_attr; } } -#endif return 0; } @@ -4579,18 +4625,22 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) | BIT_NODE_CTYPE | BIT_NODE_ANCHOR | BIT_NODE_ENCLOSURE | BIT_NODE_QUANT \ | BIT_NODE_CALL ) -#define ALLOWED_ENCLOSURE_IN_LB ( ENCLOSURE_MEMORY | ENCLOSURE_OPTION ) -#define ALLOWED_ENCLOSURE_IN_LB_NOT ENCLOSURE_OPTION +#define ALLOWED_ENCLOSURE_IN_LB ( 1<<ENCLOSURE_MEMORY | 1<<ENCLOSURE_OPTION ) +#define ALLOWED_ENCLOSURE_IN_LB_NOT (1<<ENCLOSURE_OPTION) #define ALLOWED_ANCHOR_IN_LB \ ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF \ - | ANCHOR_BEGIN_POSITION | ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND \ - | ANCHOR_WORD_BEGIN | ANCHOR_WORD_END ) + | ANCHOR_BEGIN_POSITION | ANCHOR_WORD_BOUNDARY | ANCHOR_NO_WORD_BOUNDARY \ + | ANCHOR_WORD_BEGIN | ANCHOR_WORD_END \ + | ANCHOR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY \ + | ANCHOR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY ) #define ALLOWED_ANCHOR_IN_LB_NOT \ ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE \ - | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_WORD_BOUND \ - | ANCHOR_NOT_WORD_BOUND | ANCHOR_WORD_BEGIN | ANCHOR_WORD_END ) + | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_WORD_BOUNDARY \ + | ANCHOR_NO_WORD_BOUNDARY | ANCHOR_WORD_BEGIN | ANCHOR_WORD_END \ + | ANCHOR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY \ + | ANCHOR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY ) int r; AnchorNode* an = ANCHOR_(node); @@ -5603,6 +5653,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case NODE_CTYPE: { int i, min, max; + int range; max = ONIGENC_MBC_MAXLEN_DIST(env->enc); @@ -5614,15 +5665,19 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) break; case ONIGENC_CTYPE_WORD: + range = CTYPE_(node)->ascii_mode != 0 ? 128 : SINGLE_BYTE_SIZE; if (CTYPE_(node)->not != 0) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + for (i = 0; i < range; i++) { if (! ONIGENC_IS_CODE_WORD(env->enc, i)) { add_char_opt_map_info(&opt->map, (UChar )i, env->enc); } } + for (i = range; i < SINGLE_BYTE_SIZE; i++) { + add_char_opt_map_info(&opt->map, (UChar )i, env->enc); + } } else { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + for (i = 0; i < range; i++) { if (ONIGENC_IS_CODE_WORD(env->enc, i)) { add_char_opt_map_info(&opt->map, (UChar )i, env->enc); } @@ -6171,9 +6226,7 @@ onig_free_body(regex_t* reg) if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range); if (IS_NOT_NULL(REG_EXTP(reg))) xfree(REG_EXTP(reg)); -#ifdef USE_NAMED_GROUP onig_names_free(reg); -#endif } } @@ -6202,7 +6255,7 @@ onig_transfer(regex_t* to, regex_t* from) #ifdef ONIG_DEBUG_COMPILE static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg)); #endif -#ifdef ONIG_DEBUG_PARSE_TREE +#ifdef ONIG_DEBUG_PARSE static void print_tree P_((FILE* f, Node* node)); #endif @@ -6229,7 +6282,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, if (reg->alloc == 0) { init_size = (pattern_end - pattern) * 2; if (init_size <= 0) init_size = COMPILE_INIT_SIZE; - r = BBUF_INIT(reg, init_size); + r = BB_INIT(reg, init_size); if (r != 0) goto end; } else @@ -6247,7 +6300,6 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; -#ifdef USE_NAMED_GROUP /* mixed use named group and no-named group */ if (scan_env.num_named > 0 && IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && @@ -6259,7 +6311,6 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, if (r != 0) goto err; } -#endif r = check_backrefs(root, &scan_env); if (r != 0) goto err; @@ -6287,7 +6338,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, r = setup_tree(root, reg, 0, &scan_env); if (r != 0) goto err_unset; -#ifdef ONIG_DEBUG_PARSE_TREE +#ifdef ONIG_DEBUG_PARSE print_tree(stderr, root); #endif @@ -6377,9 +6428,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, onig_node_free(root); #ifdef ONIG_DEBUG_COMPILE -#ifdef USE_NAMED_GROUP onig_print_names(stderr, reg); -#endif print_compiled_byte_code_list(stderr, reg); #endif @@ -6642,6 +6691,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) #define ARG_MEMNUM 4 #define ARG_OPTION 5 #define ARG_STATE_CHECK 6 +#define ARG_MODE 7 OnigOpInfoType OnigOpInfo[] = { { OP_FINISH, "finish", ARG_NON }, @@ -6666,7 +6716,9 @@ OnigOpInfoType OnigOpInfo[] = { { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL }, { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL }, { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL }, +#ifdef USE_OP_CCLASS_NODE { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL }, +#endif { OP_ANYCHAR, "anychar", ARG_NON }, { OP_ANYCHAR_ML, "anychar-ml", ARG_NON }, { OP_ANYCHAR_STAR, "anychar*", ARG_NON }, @@ -6674,11 +6726,13 @@ OnigOpInfoType OnigOpInfo[] = { { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL }, { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL }, { OP_WORD, "word", ARG_NON }, - { OP_NOT_WORD, "not-word", ARG_NON }, - { OP_WORD_BOUND, "word-bound", ARG_NON }, - { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, - { OP_WORD_BEGIN, "word-begin", ARG_NON }, - { OP_WORD_END, "word-end", ARG_NON }, + { OP_WORD_ASCII, "word-ascii", ARG_NON }, + { OP_NO_WORD, "not-word", ARG_NON }, + { OP_NO_WORD_ASCII, "not-word-ascii", ARG_NON }, + { OP_WORD_BOUNDARY, "word-boundary", ARG_MODE }, + { OP_NO_WORD_BOUNDARY, "not-word-boundary", ARG_MODE }, + { OP_WORD_BEGIN, "word-begin", ARG_MODE }, + { OP_WORD_END, "word-end", ARG_MODE }, { OP_BEGIN_BUF, "begin-buf", ARG_NON }, { OP_END_BUF, "end-buf", ARG_NON }, { OP_BEGIN_LINE, "begin-line", ARG_NON }, @@ -6800,6 +6854,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, UChar* start, StateCheckNumType scn; OnigCodePoint code; OnigOptionType option; + ModeType mode; UChar *q; fprintf(f, "%s", op2name(*bp)); @@ -6840,6 +6895,12 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, UChar* start, bp += SIZE_STATE_CHECK_NUM; fprintf(f, ":%d", scn); break; + + case ARG_MODE: + mode = *((ModeType* )bp); + bp += SIZE_MODE; + fprintf(f, ":%d", mode); + break; } } else { @@ -6939,6 +7000,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, UChar* start, fprintf(f, ":%d:%d:%d", n, (int )code, len); break; +#ifdef USE_OP_CCLASS_NODE case OP_CCLASS_NODE: { CClassNode *cc; @@ -6948,6 +7010,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, UChar* start, fprintf(f, ":%p:%d", cc, n); } break; +#endif case OP_BACKREF_N_IC: mem = *((MemNumType* )bp); @@ -7082,7 +7145,7 @@ print_compiled_byte_code_list(FILE* f, regex_t* reg) } #endif -#ifdef ONIG_DEBUG_PARSE_TREE +#ifdef ONIG_DEBUG_PARSE static void Indent(FILE* f, int indent) @@ -7157,9 +7220,13 @@ print_indent_tree(FILE* f, Node* node, int indent) case ONIGENC_CTYPE_WORD: if (CTYPE_(node)->not != 0) - fputs("not word", f); + fputs("not word", f); else - fputs("word", f); + fputs("word", f); + + if (CTYPE_(node)->ascii_mode != 0) + fputs(" (ascii)", f); + break; default: @@ -7171,19 +7238,23 @@ print_indent_tree(FILE* f, Node* node, int indent) case NODE_ANCHOR: fprintf(f, "<anchor:%p> ", node); switch (ANCHOR_(node)->type) { - case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break; - case ANCHOR_END_BUF: fputs("end buf", f); break; - case ANCHOR_BEGIN_LINE: fputs("begin line", f); break; - case ANCHOR_END_LINE: fputs("end line", f); break; - case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break; - case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break; - - case ANCHOR_WORD_BOUND: fputs("word bound", f); break; - case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break; + case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break; + case ANCHOR_END_BUF: fputs("end buf", f); break; + case ANCHOR_BEGIN_LINE: fputs("begin line", f); break; + case ANCHOR_END_LINE: fputs("end line", f); break; + case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break; + case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break; + + case ANCHOR_WORD_BOUNDARY: fputs("word boundary", f); break; + case ANCHOR_NO_WORD_BOUNDARY: fputs("not word boundary", f); break; #ifdef USE_WORD_BEGIN_END - case ANCHOR_WORD_BEGIN: fputs("word begin", f); break; - case ANCHOR_WORD_END: fputs("word end", f); break; + case ANCHOR_WORD_BEGIN: fputs("word begin", f); break; + case ANCHOR_WORD_END: fputs("word end", f); break; #endif + case ANCHOR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY: + fputs("extended-grapheme-cluster boundary", f); break; + case ANCHOR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY: + fputs("no-extended-grapheme-cluster boundary", f); break; case ANCHOR_PREC_READ: fprintf(f, "prec read\n"); print_indent_tree(f, NODE_BODY(node), indent + add); |