diff options
Diffstat (limited to 'src/regint.h')
-rw-r--r-- | src/regint.h | 316 |
1 files changed, 253 insertions, 63 deletions
diff --git a/src/regint.h b/src/regint.h index d6aec9d..56767e8 100644 --- a/src/regint.h +++ b/src/regint.h @@ -4,7 +4,7 @@ regint.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -54,6 +54,10 @@ #define PLATFORM_UNALIGNED_WORD_ACCESS #endif +#ifdef __GNUC__ +#define USE_GOTO_LABELS_AS_VALUES +#endif + /* config */ /* spec. config */ #define USE_CALL @@ -63,6 +67,10 @@ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR #define USE_RETRY_LIMIT_IN_MATCH +#ifdef USE_GOTO_LABELS_AS_VALUES +#define USE_THREADED_CODE +#define USE_DIRECT_THREADED_CODE +#endif /* internal config */ #define USE_OP_PUSH_OR_JUMP_EXACT @@ -251,57 +259,6 @@ typedef struct { #endif -typedef struct { - const UChar* pattern; - const UChar* pattern_end; -#ifdef USE_CALLOUT - void* tag_table; - int callout_num; - int callout_list_alloc; - CalloutListEntry* callout_list; /* index: callout num */ -#endif -} RegexExt; - -struct re_pattern_buffer { - /* common members of BBuf(bytes-buffer) */ - unsigned char* p; /* compiled pattern */ - unsigned int used; /* used space for p */ - unsigned int alloc; /* allocated space for p */ - - int num_mem; /* used memory(...) num counted from 1 */ - int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ - int num_null_check; /* OP_EMPTY_CHECK_START/END id counter */ - int num_call; /* number of subexp call */ - unsigned int capture_history; /* (?@...) flag (1-31) */ - unsigned int bt_mem_start; /* need backtrack flag */ - unsigned int bt_mem_end; /* need backtrack flag */ - int stack_pop_level; - int repeat_range_alloc; - OnigRepeatRange* repeat_range; - - OnigEncoding enc; - OnigOptionType options; - OnigSyntaxType* syntax; - OnigCaseFoldType case_fold_flag; - void* name_table; - - /* optimization info (string search, char-map and anchors) */ - int optimize; /* optimize flag */ - int threshold_len; /* search str-length for apply optimize */ - int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ - OnigLen anchor_dmin; /* (SEMI_)END_BUF anchor distance */ - OnigLen anchor_dmax; /* (SEMI_)END_BUF anchor distance */ - int sub_anchor; /* start-anchor for exact or map */ - unsigned char *exact; - unsigned char *exact_end; - unsigned char map[CHAR_MAP_SIZE]; /* used as BMH skip or char-map */ - int map_offset; - OnigLen dmin; /* min-distance of exact or map */ - OnigLen dmax; /* max-distance of exact or map */ - RegexExt* extp; -}; - - /* stack pop level */ enum StackPopLevel { STACK_POP_LEVEL_FREE = 0, @@ -430,7 +387,7 @@ typedef struct _BBuf { unsigned int alloc; } BBuf; -#define BB_INIT(buf,size) onig_bbuf_init((BBuf* )(buf), (size)) +#define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size)) #define BB_SIZE_INC(buf,inc) do{\ (buf)->alloc += (inc);\ @@ -522,8 +479,8 @@ typedef struct _BBuf { #define ANCR_WORD_END (1<<13) #define ANCR_ANYCHAR_INF (1<<14) #define ANCR_ANYCHAR_INF_ML (1<<15) -#define ANCR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY (1<<16) -#define ANCR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY (1<<17) +#define ANCR_TEXT_SEGMENT_BOUNDARY (1<<16) +#define ANCR_NO_TEXT_SEGMENT_BOUNDARY (1<<17) #define ANCHOR_HAS_BODY(a) ((a)->type < ANCR_BEGIN_BUF) @@ -559,9 +516,6 @@ enum OpCode { OP_CCLASS_NOT, OP_CCLASS_MB_NOT, OP_CCLASS_MIX_NOT, -#ifdef USE_OP_CCLASS_NODE - OP_CCLASS_NODE, /* pointer to CClassNode node */ -#endif OP_ANYCHAR, /* "." */ OP_ANYCHAR_ML, /* "." multi-line */ @@ -579,8 +533,7 @@ enum OpCode { OP_WORD_BEGIN, OP_WORD_END, - OP_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY, - OP_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY, + OP_TEXT_SEGMENT_BOUNDARY, OP_BEGIN_BUF, OP_END_BUF, @@ -596,6 +549,7 @@ enum OpCode { OP_BACKREF_MULTI, OP_BACKREF_MULTI_IC, OP_BACKREF_WITH_LEVEL, /* \k<xxx+n>, \k<xxx-n> */ + OP_BACKREF_WITH_LEVEL_IC, /* \k<xxx+n>, \k<xxx-n> */ OP_BACKREF_CHECK, /* (?(n)), (?('name')) */ OP_BACKREF_CHECK_WITH_LEVEL, /* (?(n-level)), (?('name-level')) */ @@ -611,7 +565,9 @@ enum OpCode { OP_PUSH, OP_PUSH_SUPER, OP_POP_OUT, +#ifdef USE_OP_PUSH_OR_JUMP_EXACT OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ +#endif OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */ OP_REPEAT, /* {n,m} */ OP_REPEAT_NG, /* {n,m}? (non greedy) */ @@ -658,6 +614,11 @@ enum UpdateVarType { UPDATE_VAR_RIGHT_RANGE_INIT = 4, }; +enum TextSegmentBoundaryType { + EXTENDED_GRAPHEME_CLUSTER_BOUNDARY = 0, + WORD_BOUNDARY = 1, +}; + typedef int RelAddrType; typedef int AbsAddrType; typedef int LengthType; @@ -701,13 +662,16 @@ typedef int ModeType; /* op-code + arg size */ +#if 0 #define SIZE_OP_ANYCHAR_STAR SIZE_OPCODE #define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT (SIZE_OPCODE + 1) #define SIZE_OP_JUMP (SIZE_OPCODE + SIZE_RELADDR) #define SIZE_OP_PUSH (SIZE_OPCODE + SIZE_RELADDR) #define SIZE_OP_PUSH_SUPER (SIZE_OPCODE + SIZE_RELADDR) #define SIZE_OP_POP_OUT SIZE_OPCODE +#ifdef USE_OP_PUSH_OR_JUMP_EXACT #define SIZE_OP_PUSH_OR_JUMP_EXACT1 (SIZE_OPCODE + SIZE_RELADDR + 1) +#endif #define SIZE_OP_PUSH_IF_PEEK_NEXT (SIZE_OPCODE + SIZE_RELADDR + 1) #define SIZE_OP_REPEAT_INC (SIZE_OPCODE + SIZE_MEMNUM) #define SIZE_OP_REPEAT_INC_NG (SIZE_OPCODE + SIZE_MEMNUM) @@ -740,6 +704,56 @@ typedef int ModeType; #define SIZE_OP_CALLOUT_NAME (SIZE_OPCODE + SIZE_MEMNUM + SIZE_MEMNUM) #endif +#else /* if 0 */ + +/* for relative address increment to go next op. */ +#define SIZE_INC_OP 1 + +#define SIZE_OP_ANYCHAR_STAR 1 +#define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT 1 +#define SIZE_OP_JUMP 1 +#define SIZE_OP_PUSH 1 +#define SIZE_OP_PUSH_SUPER 1 +#define SIZE_OP_POP_OUT 1 +#ifdef USE_OP_PUSH_OR_JUMP_EXACT +#define SIZE_OP_PUSH_OR_JUMP_EXACT1 1 +#endif +#define SIZE_OP_PUSH_IF_PEEK_NEXT 1 +#define SIZE_OP_REPEAT 1 +#define SIZE_OP_REPEAT_INC 1 +#define SIZE_OP_REPEAT_INC_NG 1 +#define SIZE_OP_WORD_BOUNDARY 1 +#define SIZE_OP_PREC_READ_START 1 +#define SIZE_OP_PREC_READ_NOT_START 1 +#define SIZE_OP_PREC_READ_END 1 +#define SIZE_OP_PREC_READ_NOT_END 1 +#define SIZE_OP_BACKREF 1 +#define SIZE_OP_FAIL 1 +#define SIZE_OP_MEMORY_START 1 +#define SIZE_OP_MEMORY_START_PUSH 1 +#define SIZE_OP_MEMORY_END_PUSH 1 +#define SIZE_OP_MEMORY_END_PUSH_REC 1 +#define SIZE_OP_MEMORY_END 1 +#define SIZE_OP_MEMORY_END_REC 1 +#define SIZE_OP_ATOMIC_START 1 +#define SIZE_OP_ATOMIC_END 1 +#define SIZE_OP_EMPTY_CHECK_START 1 +#define SIZE_OP_EMPTY_CHECK_END 1 +#define SIZE_OP_LOOK_BEHIND 1 +#define SIZE_OP_LOOK_BEHIND_NOT_START 1 +#define SIZE_OP_LOOK_BEHIND_NOT_END 1 +#define SIZE_OP_CALL 1 +#define SIZE_OP_RETURN 1 +#define SIZE_OP_PUSH_SAVE_VAL 1 +#define SIZE_OP_UPDATE_VAR 1 + +#ifdef USE_CALLOUT +#define SIZE_OP_CALLOUT_CONTENTS 1 +#define SIZE_OP_CALLOUT_NAME 1 +#endif +#endif /* if 0 */ + + #define MC_ESC(syn) (syn)->meta_char_table.esc #define MC_ANYCHAR(syn) (syn)->meta_char_table.anychar #define MC_ANYTIME(syn) (syn)->meta_char_table.anytime @@ -791,8 +805,186 @@ typedef int ModeType; #define NCCLASS_CLEAR_NOT(nd) NCCLASS_FLAG_CLEAR(nd, FLAG_NCCLASS_NOT) #define IS_NCCLASS_NOT(nd) IS_NCCLASS_FLAG_ON(nd, FLAG_NCCLASS_NOT) -extern void onig_add_end_call(void (*func)(void)); +typedef struct { +#ifdef USE_DIRECT_THREADED_CODE + const void* opaddr; +#else + enum OpCode opcode; +#endif + union { + struct { + UChar s[16]; /* Now used first 7 bytes only. */ + } exact; + struct { + UChar* s; + LengthType n; /* number of chars */ + } exact_n; /* EXACTN, EXACTN_IC, EXACTMB2N, EXACTMB3N */ + struct { + UChar* s; + LengthType n; /* number of chars */ + LengthType len; /* char byte length */ + } exact_len_n; /* EXACTMBN */ + struct { + BitSetRef bsp; + } cclass; + struct { + void* mb; + } cclass_mb; + struct { + void* mb; /* mb must be same position with cclass_mb for match_at(). */ + BitSetRef bsp; + } cclass_mix; + struct { + UChar c; + } anychar_star_peek_next; + struct { + ModeType mode; + } word_boundary; /* OP_WORD_BOUNDARY, OP_NO_WORD_BOUNDARY, OP_WORD_BEGIN, OP_WORD_END */ + struct { + enum TextSegmentBoundaryType type; + int not; + } text_segment_boundary; + struct { + union { + MemNumType n1; /* num == 1 */ + MemNumType* ns; /* num > 1 */ + }; + int num; + int nest_level; + } backref_general; /* BACKREF_MULTI, BACKREF_MULTI_IC, BACKREF_WITH_LEVEL, BACKREF_CHECK, BACKREF_CHECK_WITH_LEVEL, */ + struct { + MemNumType n1; + } backref_n; /* BACKREF_N, BACKREF_N_IC */ + struct { + MemNumType num; + } memory_start; /* MEMORY_START, MEMORY_START_PUSH */ + struct { + MemNumType num; + } memory_end; /* MEMORY_END, MEMORY_END_REC, MEMORY_END_PUSH, MEMORY_END_PUSH_REC */ + struct { + RelAddrType addr; + } jump; + struct { + RelAddrType addr; + } push; + struct { + RelAddrType addr; + UChar c; + } push_or_jump_exact1; + struct { + RelAddrType addr; + UChar c; + } push_if_peek_next; + struct { + MemNumType id; + RelAddrType addr; + } repeat; /* REPEAT, REPEAT_NG */ + struct { + MemNumType id; + } repeat_inc; /* REPEAT_INC, REPEAT_INC_SG, REPEAT_INC_NG, REPEAT_INC_NG_SG */ + struct { + MemNumType mem; + } empty_check_start; + struct { + MemNumType mem; + } empty_check_end; /* EMPTY_CHECK_END, EMPTY_CHECK_END_MEMST, EMPTY_CHECK_END_MEMST_PUSH */ + struct { + RelAddrType addr; + } prec_read_not_start; + struct { + LengthType len; + } look_behind; + struct { + LengthType len; + RelAddrType addr; + } look_behind_not_start; + struct { + AbsAddrType addr; + } call; + struct { + SaveType type; + MemNumType id; + } push_save_val; + struct { + UpdateVarType type; + MemNumType id; + } update_var; +#ifdef USE_CALLOUT + struct { + MemNumType num; + } callout_contents; + struct { + MemNumType num; + MemNumType id; + } callout_name; +#endif + }; +} Operation; + +typedef struct { + const UChar* pattern; + const UChar* pattern_end; +#ifdef USE_CALLOUT + void* tag_table; + int callout_num; + int callout_list_alloc; + CalloutListEntry* callout_list; /* index: callout num */ +#endif +} RegexExt; + +struct re_pattern_buffer { + /* common members of BBuf(bytes-buffer) */ + Operation* ops; +#ifdef USE_DIRECT_THREADED_CODE + enum OpCode* ocs; +#endif + Operation* ops_curr; + unsigned int ops_used; /* used space for ops */ + unsigned int ops_alloc; /* allocated space for ops */ + unsigned char* string_pool; + unsigned char* string_pool_end; + + int num_mem; /* used memory(...) num counted from 1 */ + int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ + int num_null_check; /* OP_EMPTY_CHECK_START/END id counter */ + int num_call; /* number of subexp call */ + unsigned int capture_history; /* (?@...) flag (1-31) */ + unsigned int bt_mem_start; /* need backtrack flag */ + unsigned int bt_mem_end; /* need backtrack flag */ + int stack_pop_level; + int repeat_range_alloc; + OnigRepeatRange* repeat_range; + + OnigEncoding enc; + OnigOptionType options; + OnigSyntaxType* syntax; + OnigCaseFoldType case_fold_flag; + void* name_table; + + /* optimization info (string search, char-map and anchors) */ + int optimize; /* optimize flag */ + int threshold_len; /* search str-length for apply optimize */ + int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ + OnigLen anchor_dmin; /* (SEMI_)END_BUF anchor distance */ + OnigLen anchor_dmax; /* (SEMI_)END_BUF anchor distance */ + int sub_anchor; /* start-anchor for exact or map */ + unsigned char *exact; + unsigned char *exact_end; + unsigned char map[CHAR_MAP_SIZE]; /* used as BMH skip or char-map */ + int map_offset; + OnigLen dmin; /* min-distance of exact or map */ + OnigLen dmax; /* max-distance of exact or map */ + RegexExt* extp; +}; + +#define COP(reg) ((reg)->ops_curr) +#define COP_CURR_OFFSET(reg) ((reg)->ops_used - 1) +#define COP_CURR_OFFSET_BYTES(reg, p) \ + ((int )((char* )(&((reg)->ops_curr->p)) - (char* )((reg)->ops))) + + +extern void onig_add_end_call(void (*func)(void)); #ifdef ONIG_DEBUG @@ -809,9 +1001,7 @@ extern int onig_print_statistics P_((FILE* f)); extern void onig_warning(const char* s); extern UChar* onig_error_code_to_format P_((int code)); extern void onig_snprintf_with_pattern PV_((UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, ...)); -extern int onig_bbuf_init P_((BBuf* buf, int size)); extern int onig_compile P_((regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo)); -extern void onig_transfer P_((regex_t* to, regex_t* from)); extern int onig_is_code_in_cc_len P_((int enclen, OnigCodePoint code, void* /* CClassNode* */ cc)); extern RegexExt* onig_get_regex_ext(regex_t* reg); extern int onig_ext_set_pattern(regex_t* reg, const UChar* pattern, const UChar* pattern_end); |