/********************************************************************** regparse.c - Oniguruma (regular expression library) **********************************************************************/ /*- * Copyright (c) 2002-2022 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifdef DEBUG_NODE_FREE #ifndef NEED_TO_INCLUDE_STDIO #define NEED_TO_INCLUDE_STDIO #endif #endif #include "regparse.h" #include "st.h" #define INIT_TAG_NAMES_ALLOC_NUM 5 #define WARN_BUFSIZE 256 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS #define IS_ALLOWED_CODE_IN_CALLOUT_NAME(c) \ ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_' /* || c == '!' 
*/) #define IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c) \ ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_') #define OPTON_SINGLELINE(option) ((option) & ONIG_OPTION_SINGLELINE) #define OPTON_MULTILINE(option) ((option) & ONIG_OPTION_MULTILINE) #define OPTON_IGNORECASE(option) ((option) & ONIG_OPTION_IGNORECASE) #define OPTON_EXTEND(option) ((option) & ONIG_OPTION_EXTEND) #define OPTON_WORD_ASCII(option) \ ((option) & (ONIG_OPTION_WORD_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) #define OPTON_DIGIT_ASCII(option) \ ((option) & (ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) #define OPTON_SPACE_ASCII(option) \ ((option) & (ONIG_OPTION_SPACE_IS_ASCII | ONIG_OPTION_POSIX_IS_ASCII)) #define OPTON_POSIX_ASCII(option) ((option) & ONIG_OPTION_POSIX_IS_ASCII) #define OPTON_TEXT_SEGMENT_WORD(option) ((option) & ONIG_OPTION_TEXT_SEGMENT_WORD) #define OPTON_IS_ASCII_MODE_CTYPE(ctype, options) \ ((ctype) >= 0 && \ (((ctype) < ONIGENC_CTYPE_ASCII && OPTON_POSIX_ASCII(options)) ||\ ((ctype) == ONIGENC_CTYPE_WORD && OPTON_WORD_ASCII(options)) ||\ ((ctype) == ONIGENC_CTYPE_DIGIT && OPTON_DIGIT_ASCII(options)) ||\ ((ctype) == ONIGENC_CTYPE_SPACE && OPTON_SPACE_ASCII(options)))) OnigSyntaxType OnigSyntaxOniguruma = { (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL | ONIG_SYN_OP_ESC_CONTROL_CHARS | ONIG_SYN_OP_ESC_C_CONTROL ) & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_ONIGURUMA | ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE | ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP | ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS | ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME | ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT | ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE | ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT | ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | 
ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 ) , ( SYN_GNU_REGEX_BV | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND | ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | #ifdef USE_WHOLE_OPTIONS ONIG_SYN_WHOLE_OPTIONS | #endif ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE , { (OnigCodePoint )'\\' /* esc */ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ } }; OnigSyntaxType OnigSyntaxRuby = { (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL | ONIG_SYN_OP_ESC_CONTROL_CHARS | ONIG_SYN_OP_ESC_C_CONTROL ) & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_RUBY | ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE | ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP | ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT | ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE | ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_H_XDIGIT | ONIG_SYN_OP2_ESC_U_HEX4 ) , ( SYN_GNU_REGEX_BV | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE , { (OnigCodePoint )'\\' /* esc */ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/
  , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
  , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
  }
};

/* Default syntax pointer; initially the Oniguruma syntax. */
OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_ONIGURUMA;

/*
 * Byte buffer (BBuf) helpers.
 * Every macro that can grow the buffer executes `return(ONIGERR_MEMORY)` in
 * the *calling* function on failure, so they may only be used inside
 * functions returning int.
 */
#define BB_INIT(buf,size)    bbuf_init((BBuf* )(buf), (size))

/*
 * Double the capacity of buf (at least once) until it is >= low.
 * - A capacity of 0 is bumped to 1 first; doubling 0 would never terminate.
 * - Doubling stops with ONIGERR_MEMORY instead of wrapping around.
 * - The buffer fields are only updated after xrealloc() succeeded, so on
 *   failure buf still owns its old, valid storage.
 */
#define BB_EXPAND(buf,low) do{\
  unsigned int bb_expand_alloc_ = (buf)->alloc;\
  UChar* bb_expand_p_;\
  do {\
    if (bb_expand_alloc_ == 0) bb_expand_alloc_ = 1;\
    else if (bb_expand_alloc_ > (~((unsigned int )0)) / 2) return(ONIGERR_MEMORY);\
    else bb_expand_alloc_ *= 2;\
  } while (bb_expand_alloc_ < (unsigned int )(low));\
  bb_expand_p_ = (UChar* )xrealloc((buf)->p, bb_expand_alloc_);\
  if (IS_NULL(bb_expand_p_)) return(ONIGERR_MEMORY);\
  (buf)->p     = bb_expand_p_;\
  (buf)->alloc = bb_expand_alloc_;\
} while (0)

/*
 * Make sure the capacity of buf is >= size; does nothing when it already is.
 * Same zero-capacity, overflow and realloc-failure handling as BB_EXPAND.
 */
#define BB_ENSURE_SIZE(buf,size) do{\
  unsigned int new_alloc = (buf)->alloc;\
  while (new_alloc < (unsigned int )(size)) {\
    if (new_alloc == 0) new_alloc = 1;\
    else if (new_alloc > (~((unsigned int )0)) / 2) return(ONIGERR_MEMORY);\
    else new_alloc *= 2;\
  }\
  if ((buf)->alloc != new_alloc) {\
    UChar* new_p = (UChar* )xrealloc((buf)->p, new_alloc);\
    if (IS_NULL(new_p)) return(ONIGERR_MEMORY);\
    (buf)->p     = new_p;\
    (buf)->alloc = new_alloc;\
  }\
} while (0)

/* Copy n bytes to offset pos, growing the buffer and `used` as needed. */
#define BB_WRITE(buf,pos,bytes,n) do{\
  int used = (pos) + (n);\
  if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\
  xmemcpy((buf)->p + (pos), (bytes), (n));\
  if ((buf)->used < (unsigned int )used) (buf)->used = used;\
} while (0)

/* Store a single byte at offset pos, growing the buffer and `used` as needed. */
#define BB_WRITE1(buf,pos,byte) do{\
  int used = (pos) + 1;\
  if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\
  (buf)->p[(pos)] = (byte);\
  if ((buf)->used < (unsigned int )used) (buf)->used = used;\
} while (0)

/* Append at the current end of the used region. */
#define BB_ADD(buf,bytes,n)       BB_WRITE((buf),(buf)->used,(bytes),(n))
#define BB_ADD1(buf,byte)         BB_WRITE1((buf),(buf)->used,(byte))
#define BB_GET_ADD_ADDRESS(buf)   ((buf)->p + (buf)->used)
#define BB_GET_OFFSET_POS(buf)    ((buf)->used)

/* from < to */
#define BB_MOVE_RIGHT(buf,from,to,n) do {\
  if ((unsigned int )((to)+(n)) > (buf)->alloc) BB_EXPAND((buf),(to) + (n));\
  xmemmove((buf)->p + (to), (buf)->p + (from), (n));\
  if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\
} while (0)

/* from > to */
#define BB_MOVE_LEFT(buf,from,to,n) do {\
  xmemmove((buf)->p + (to), (buf)->p + (from), (n));\
} while (0)

/* from > to */
#define BB_MOVE_LEFT_REDUCE(buf,from,to) do {\
  xmemmove((buf)->p + (to), (buf)->p + (from), 
(buf)->used - (from));\ (buf)->used -= (from - to);\ } while (0) #define BB_INSERT(buf,pos,bytes,n) do {\ if (pos >= (buf)->used) {\ BB_WRITE(buf,pos,bytes,n);\ }\ else {\ BB_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\ xmemcpy((buf)->p + (pos), (bytes), (n));\ }\ } while (0) #define BB_GET_BYTE(buf, pos) (buf)->p[(pos)] typedef enum { CS_VALUE, CS_RANGE, CS_COMPLETE, CS_START } CSTATE; typedef enum { CV_UNDEF, CV_SB, CV_MB, CV_CPROP } CVAL; extern void onig_null_warn(const char* s ARG_UNUSED) { } #ifdef DEFAULT_WARN_FUNCTION static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; #else static OnigWarnFunc onig_warn = onig_null_warn; #endif #ifdef DEFAULT_VERB_WARN_FUNCTION static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION; #else static OnigWarnFunc onig_verb_warn = onig_null_warn; #endif extern void onig_set_warn_func(OnigWarnFunc f) { onig_warn = f; } extern void onig_set_verb_warn_func(OnigWarnFunc f) { onig_verb_warn = f; } extern void onig_warning(const char* s) { if (onig_warn == onig_null_warn) return ; (*onig_warn)(s); } #define DEFAULT_MAX_CAPTURE_NUM 32767 static int MaxCaptureNum = DEFAULT_MAX_CAPTURE_NUM; extern int onig_set_capture_num_limit(int num) { if (num < 0) return -1; MaxCaptureNum = num; return 0; } static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT; extern unsigned int onig_get_parse_depth_limit(void) { return ParseDepthLimit; } extern int onig_set_parse_depth_limit(unsigned int depth) { if (depth == 0) ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT; else ParseDepthLimit = depth; return 0; } #ifdef ONIG_DEBUG_PARSE #define INC_PARSE_DEPTH(d) do {\ (d)++;\ if (env->max_parse_depth < (d)) env->max_parse_depth = d;\ if ((d) > ParseDepthLimit) \ return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ } while (0) #else #define INC_PARSE_DEPTH(d) do {\ (d)++;\ if ((d) > ParseDepthLimit) \ return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ } while (0) #endif #define DEC_PARSE_DEPTH(d) (d)-- static int 
bbuf_init(BBuf* buf, int size)
{
  /*
   * Initialise buf with `size` bytes of storage.  size <= 0 yields an empty
   * buffer (p == NULL, alloc == 0).  Returns 0 or ONIGERR_MEMORY.
   */
  if (size <= 0) {
    size   = 0;
    buf->p = NULL;
  }
  else {
    buf->p = (UChar* )xmalloc(size);
    if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
  }

  buf->alloc = size;
  buf->used  = 0;
  return 0;
}

/* Release the storage of bbuf and bbuf itself; a null bbuf is ignored. */
static void
bbuf_free(BBuf* bbuf)
{
  if (IS_NOT_NULL(bbuf)) {
    if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
    xfree(bbuf);
  }
}

/*
 * Allocate a copy of `from` and store it in *rto.
 * Returns 0 on success.  On failure *rto is null and an error code is
 * returned.
 */
static int
bbuf_clone(BBuf** rto, BBuf* from)
{
  int r;
  BBuf *to;

  *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
  CHECK_NULL_RETURN_MEMERR(to);
  r = BB_INIT(to, from->alloc);
  if (r != 0) {
    bbuf_free(to);
    *rto = 0;
    return r;
  }
  to->used = from->used;
  /* An empty source may have p == NULL (see bbuf_init); passing null
     pointers to the copy routine is not valid even for zero bytes. */
  if (from->used > 0)
    xmemcpy(to->p, from->p, from->used);
  return 0;
}

/*
 * Convert a relative group number to an absolute one using env->num_mem.
 * rel_no > 0 counts forward from the current number of groups (rejected
 * with ONIGERR_INVALID_BACKREF if the sum would exceed ONIG_INT_MAX);
 * otherwise the result is num_mem + 1 + rel_no.
 */
static int
backref_rel_to_abs(int rel_no, ParseEnv* env)
{
  if (rel_no > 0) {
    if (rel_no > ONIG_INT_MAX - env->num_mem)
      return ONIGERR_INVALID_BACKREF;
    return env->num_mem + rel_no;
  }
  else {
    return env->num_mem + 1 + rel_no;
  }
}

#define OPTION_ON(v,f)     ((v) |= (f))
#define OPTION_OFF(v,f)    ((v) &= ~(f))

/* Whole expansion parenthesized so it is safe inside larger expressions. */
#define OPTION_NEGATE(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))

#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! 
ONIGENC_IS_SINGLEBYTE(enc)) {\ r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\ if (r != 0) return r;\ }\ } while (0) #define BITSET_IS_EMPTY(bs,empty) do {\ int i;\ empty = 1;\ for (i = 0; i < (int )BITSET_REAL_SIZE; i++) {\ if ((bs)[i] != 0) {\ empty = 0; break;\ }\ }\ } while (0) static void bitset_set_range(BitSetRef bs, int from, int to) { int i; for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) { BITSET_SET_BIT(bs, i); } } static void bitset_invert(BitSetRef bs) { int i; for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { bs[i] = ~(bs[i]); } } static void bitset_invert_to(BitSetRef from, BitSetRef to) { int i; for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { to[i] = ~(from[i]); } } static void bitset_and(BitSetRef dest, BitSetRef bs) { int i; for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] &= bs[i]; } } static void bitset_or(BitSetRef dest, BitSetRef bs) { int i; for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] |= bs[i]; } } static void bitset_copy(BitSetRef dest, BitSetRef bs) { int i; for (i = 0; i < (int )BITSET_REAL_SIZE; i++) { dest[i] = bs[i]; } } extern int onig_strncmp(const UChar* s1, const UChar* s2, int n) { int x; while (n-- > 0) { x = *s2++ - *s1++; if (x) return x; } return 0; } extern void onig_strcpy(UChar* dest, const UChar* src, const UChar* end) { int len = (int )(end - src); if (len > 0) { xmemcpy(dest, src, len); dest[len] = (UChar )0; } } /* scan pattern methods */ #define PEND_VALUE 0 #define PFETCH_READY UChar* pfetch_prev #define PEND (p < end ? 0 : 1) #define PUNFETCH p = pfetch_prev #define PINC do { \ pfetch_prev = p; \ p += ONIGENC_MBC_ENC_LEN(enc, p); \ } while (0) #define PFETCH(c) do { \ c = ONIGENC_MBC_TO_CODE(enc, p, end); \ pfetch_prev = p; \ p += ONIGENC_MBC_ENC_LEN(enc, p); \ } while (0) #define PINC_S do { \ p += ONIGENC_MBC_ENC_LEN(enc, p); \ } while (0) #define PFETCH_S(c) do { \ c = ONIGENC_MBC_TO_CODE(enc, p, end); \ p += ONIGENC_MBC_ENC_LEN(enc, p); \ } while (0) #define PPEEK (p < end ? 
ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE) #define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c) static UChar* strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, int capa) { UChar* r; ptrdiff_t dest_delta = dest_end - dest; if (dest) r = (UChar* )xrealloc(dest, capa + 1); else r = (UChar* )xmalloc(capa + 1); CHECK_NULL_RETURN(r); onig_strcpy(r + dest_delta, src, src_end); return r; } /* dest on static area */ static UChar* strcat_capa_from_static(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, int capa) { UChar* r; r = (UChar* )xmalloc(capa + 1); CHECK_NULL_RETURN(r); onig_strcpy(r, dest, dest_end); onig_strcpy(r + (dest_end - dest), src, src_end); return r; } #ifdef USE_ST_LIBRARY typedef struct { UChar* s; UChar* end; } st_str_end_key; static int str_end_cmp(st_str_end_key* x, st_str_end_key* y) { UChar *p, *q; int c; if ((x->end - x->s) != (y->end - y->s)) return 1; p = x->s; q = y->s; while (p < x->end) { c = (int )*p - (int )*q; if (c != 0) return c; p++; q++; } return 0; } static int str_end_hash(st_str_end_key* x) { UChar *p; unsigned val = 0; p = x->s; while (p < x->end) { val = val * 997 + (unsigned )*p++; } return (int) (val + (val >> 5)); } extern hash_table_type onig_st_init_strend_table_with_size(int size) { static struct st_hash_type hashType = { str_end_cmp, str_end_hash, }; return (hash_table_type )onig_st_init_table_with_size(&hashType, size); } extern int onig_st_lookup_strend(hash_table_type table, const UChar* str_key, const UChar* end_key, hash_data_type *value) { st_str_end_key key; key.s = (UChar* )str_key; key.end = (UChar* )end_key; return onig_st_lookup(table, (st_data_t )(&key), value); } extern int onig_st_insert_strend(hash_table_type table, const UChar* str_key, const UChar* end_key, hash_data_type value) { st_str_end_key* key; int result; key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key)); CHECK_NULL_RETURN_MEMERR(key); key->s = (UChar* )str_key; key->end = (UChar* )end_key; 
result = onig_st_insert(table, (st_data_t )key, value); if (result) { xfree(key); } return result; } #ifdef USE_CALLOUT typedef struct { OnigEncoding enc; int type; /* callout type: single or not */ UChar* s; UChar* end; } st_callout_name_key; static int callout_name_table_cmp(st_callout_name_key* x, st_callout_name_key* y) { UChar *p, *q; int c; if (x->enc != y->enc) return 1; if (x->type != y->type) return 1; if ((x->end - x->s) != (y->end - y->s)) return 1; p = x->s; q = y->s; while (p < x->end) { c = (int )*p - (int )*q; if (c != 0) return c; p++; q++; } return 0; } static int callout_name_table_hash(st_callout_name_key* x) { UChar *p; unsigned int val = 0; p = x->s; while (p < x->end) { val = val * 997 + (unsigned int )*p++; } /* use intptr_t for escape warning in Windows */ return (int )(val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type); } extern hash_table_type onig_st_init_callout_name_table_with_size(int size) { static struct st_hash_type hashType = { callout_name_table_cmp, callout_name_table_hash, }; return (hash_table_type )onig_st_init_table_with_size(&hashType, size); } extern int onig_st_lookup_callout_name_table(hash_table_type table, OnigEncoding enc, int type, const UChar* str_key, const UChar* end_key, hash_data_type *value) { st_callout_name_key key; key.enc = enc; key.type = type; key.s = (UChar* )str_key; key.end = (UChar* )end_key; return onig_st_lookup(table, (st_data_t )(&key), value); } static int st_insert_callout_name_table(hash_table_type table, OnigEncoding enc, int type, UChar* str_key, UChar* end_key, hash_data_type value) { st_callout_name_key* key; int result; key = (st_callout_name_key* )xmalloc(sizeof(st_callout_name_key)); CHECK_NULL_RETURN_MEMERR(key); /* key->s: don't duplicate, because str_key is duped in callout_name_entry() */ key->enc = enc; key->type = type; key->s = str_key; key->end = end_key; result = onig_st_insert(table, (st_data_t )key, value); if (result) { xfree(key); } return result; } #endif #endif /* 
USE_ST_LIBRARY */ #define INIT_NAME_BACKREFS_ALLOC_NUM 8 typedef struct { UChar* name; int name_len; /* byte length */ int back_num; /* number of backrefs */ int back_alloc; int back_ref1; int* back_refs; } NameEntry; #ifdef USE_ST_LIBRARY #define INIT_NAMES_ALLOC_NUM 5 typedef st_table NameTable; typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ #define NAMEBUF_SIZE 24 #define NAMEBUF_SIZE_1 25 #ifdef ONIG_DEBUG static int i_print_name_entry(UChar* key, NameEntry* e, void* arg) { int i; FILE* fp = (FILE* )arg; fprintf(fp, "%s: ", e->name); if (e->back_num == 0) fputs("-", fp); else if (e->back_num == 1) fprintf(fp, "%d", e->back_ref1); else { for (i = 0; i < e->back_num; i++) { if (i > 0) fprintf(fp, ", "); fprintf(fp, "%d", e->back_refs[i]); } } fputs("\n", fp); return ST_CONTINUE; } extern int onig_print_names(FILE* fp, regex_t* reg) { NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) { fprintf(fp, "name table\n"); onig_st_foreach(t, i_print_name_entry, (HashDataType )fp); fputs("\n", fp); } return 0; } #endif /* ONIG_DEBUG */ static int i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED) { xfree(e->name); if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); xfree(key); xfree(e); return ST_DELETE; } static int names_clear(regex_t* reg) { NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) { onig_st_foreach(t, i_free_name_entry, 0); } return 0; } extern int onig_names_free(regex_t* reg) { int r; NameTable* t; r = names_clear(reg); if (r != 0) return r; t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) onig_st_free_table(t); reg->name_table = (void* )NULL; return 0; } static NameEntry* name_find(regex_t* reg, const UChar* name, const UChar* name_end) { NameEntry* e; NameTable* t = (NameTable* )reg->name_table; e = (NameEntry* )NULL; if (IS_NOT_NULL(t)) { onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); } return e; } typedef struct { int (*func)(const UChar*, const 
UChar*,int,int*,regex_t*,void*); regex_t* reg; void* arg; int ret; OnigEncoding enc; } INamesArg; static int i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg) { int r = (*(arg->func))(e->name, e->name + e->name_len, e->back_num, (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), arg->reg, arg->arg); if (r != 0) { arg->ret = r; return ST_STOP; } return ST_CONTINUE; } extern int onig_foreach_name(regex_t* reg, int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) { INamesArg narg; NameTable* t = (NameTable* )reg->name_table; narg.ret = 0; if (IS_NOT_NULL(t)) { narg.func = func; narg.reg = reg; narg.arg = arg; narg.enc = reg->enc; /* should be pattern encoding. */ onig_st_foreach(t, i_names, (HashDataType )&narg); } return narg.ret; } static int i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumMap* map) { int i; if (e->back_num > 1) { for (i = 0; i < e->back_num; i++) { e->back_refs[i] = map[e->back_refs[i]].new_val; } } else if (e->back_num == 1) { e->back_ref1 = map[e->back_ref1].new_val; } return ST_CONTINUE; } extern int onig_renumber_name_table(regex_t* reg, GroupNumMap* map) { NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) { onig_st_foreach(t, i_renumber_name, (HashDataType )map); } return 0; } extern int onig_number_of_names(regex_t* reg) { NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) return t->num_entries; else return 0; } #else /* USE_ST_LIBRARY */ #define INIT_NAMES_ALLOC_NUM 8 typedef struct { NameEntry* e; int num; int alloc; } NameTable; #ifdef ONIG_DEBUG extern int onig_print_names(FILE* fp, regex_t* reg) { int i, j; NameEntry* e; NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t) && t->num > 0) { fprintf(fp, "name table\n"); for (i = 0; i < t->num; i++) { e = &(t->e[i]); fprintf(fp, "%s: ", e->name); if (e->back_num == 0) { fputs("-", fp); } else if (e->back_num == 1) { fprintf(fp, "%d", e->back_ref1); } else { for (j = 0; j < e->back_num; j++) { if (j > 0) 
fprintf(fp, ", "); fprintf(fp, "%d", e->back_refs[j]); } } fputs("\n", fp); } fputs("\n", fp); } return 0; } #endif static int names_clear(regex_t* reg) { int i; NameEntry* e; NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) { for (i = 0; i < t->num; i++) { e = &(t->e[i]); if (IS_NOT_NULL(e->name)) { xfree(e->name); e->name = NULL; e->name_len = 0; e->back_num = 0; e->back_alloc = 0; if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); e->back_refs = (int* )NULL; } } if (IS_NOT_NULL(t->e)) { xfree(t->e); t->e = NULL; } t->num = 0; } return 0; } extern int onig_names_free(regex_t* reg) { int r; NameTable* t; r = names_clear(reg); if (r != 0) return r; t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) xfree(t); reg->name_table = NULL; return 0; } static NameEntry* name_find(regex_t* reg, UChar* name, UChar* name_end) { int i, len; NameEntry* e; NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) { len = name_end - name; for (i = 0; i < t->num; i++) { e = &(t->e[i]); if (len == e->name_len && onig_strncmp(name, e->name, len) == 0) return e; } } return (NameEntry* )NULL; } extern int onig_foreach_name(regex_t* reg, int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) { int i, r; NameEntry* e; NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) { for (i = 0; i < t->num; i++) { e = &(t->e[i]); r = (*func)(e->name, e->name + e->name_len, e->back_num, (e->back_num > 1 ? 
e->back_refs : &(e->back_ref1)), reg, arg); if (r != 0) return r; } } return 0; } extern int onig_number_of_names(regex_t* reg) { NameTable* t = (NameTable* )reg->name_table; if (IS_NOT_NULL(t)) return t->num; else return 0; } #endif /* else USE_ST_LIBRARY */ static int name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ParseEnv* env) { int r; int alloc; NameEntry* e; NameTable* t = (NameTable* )reg->name_table; if (name_end - name <= 0) return ONIGERR_EMPTY_GROUP_NAME; e = name_find(reg, name, name_end); if (IS_NULL(e)) { #ifdef USE_ST_LIBRARY if (IS_NULL(t)) { t = onig_st_init_strend_table_with_size(INIT_NAMES_ALLOC_NUM); CHECK_NULL_RETURN_MEMERR(t); reg->name_table = (void* )t; } e = (NameEntry* )xmalloc(sizeof(NameEntry)); CHECK_NULL_RETURN_MEMERR(e); e->name = onigenc_strdup(reg->enc, name, name_end); if (IS_NULL(e->name)) { xfree(e); return ONIGERR_MEMORY; } r = onig_st_insert_strend(t, e->name, (e->name + (name_end - name)), (HashDataType )e); if (r < 0) return r; e->name_len = (int )(name_end - name); e->back_num = 0; e->back_alloc = 0; e->back_refs = (int* )NULL; #else if (IS_NULL(t)) { alloc = INIT_NAMES_ALLOC_NUM; t = (NameTable* )xmalloc(sizeof(NameTable)); CHECK_NULL_RETURN_MEMERR(t); t->e = NULL; t->alloc = 0; t->num = 0; t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc); if (IS_NULL(t->e)) { xfree(t); return ONIGERR_MEMORY; } t->alloc = alloc; reg->name_table = t; goto clear; } else if (t->num == t->alloc) { int i; alloc = t->alloc * 2; t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); CHECK_NULL_RETURN_MEMERR(t->e); t->alloc = alloc; clear: for (i = t->num; i < t->alloc; i++) { t->e[i].name = NULL; t->e[i].name_len = 0; t->e[i].back_num = 0; t->e[i].back_alloc = 0; t->e[i].back_refs = (int* )NULL; } } e = &(t->e[t->num]); t->num++; e->name = onigenc_strdup(reg->enc, name, name_end); if (IS_NULL(e->name)) return ONIGERR_MEMORY; e->name_len = name_end - name; #endif } if (e->back_num >= 1 && ! 
IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) { onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, name, name_end); return ONIGERR_MULTIPLEX_DEFINED_NAME; } e->back_num++; if (e->back_num == 1) { e->back_ref1 = backref; } else { if (e->back_num == 2) { alloc = INIT_NAME_BACKREFS_ALLOC_NUM; e->back_refs = (int* )xmalloc(sizeof(int) * alloc); CHECK_NULL_RETURN_MEMERR(e->back_refs); e->back_alloc = alloc; e->back_refs[0] = e->back_ref1; e->back_refs[1] = backref; } else { if (e->back_num > e->back_alloc) { alloc = e->back_alloc * 2; e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); CHECK_NULL_RETURN_MEMERR(e->back_refs); e->back_alloc = alloc; } e->back_refs[e->back_num - 1] = backref; } } return 0; } extern int onig_name_to_group_numbers(regex_t* reg, const UChar* name, const UChar* name_end, int** nums) { NameEntry* e = name_find(reg, name, name_end); if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE; switch (e->back_num) { case 0: break; case 1: *nums = &(e->back_ref1); break; default: *nums = e->back_refs; break; } return e->back_num; } static int name_to_group_numbers(ParseEnv* env, const UChar* name, const UChar* name_end, int** nums) { regex_t* reg; NameEntry* e; reg = env->reg; e = name_find(reg, name, name_end); if (IS_NULL(e)) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, (UChar* )name, (UChar* )name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } switch (e->back_num) { case 0: break; case 1: *nums = &(e->back_ref1); break; default: *nums = e->back_refs; break; } return e->back_num; } extern int onig_name_to_backref_number(regex_t* reg, const UChar* name, const UChar* name_end, OnigRegion *region) { int i, n, *nums; n = onig_name_to_group_numbers(reg, name, name_end, &nums); if (n < 0) return n; else if (n == 0) return ONIGERR_PARSER_BUG; else if (n == 1) return nums[0]; else { if (IS_NOT_NULL(region)) { for (i = n - 1; i >= 0; i--) { if (region->beg[nums[i]] != 
ONIG_REGION_NOTPOS) return nums[i]; } } return nums[n - 1]; } } extern int onig_noname_group_capture_is_active(regex_t* reg) { if (OPTON_DONT_CAPTURE_GROUP(reg->options)) return 0; if (onig_number_of_names(reg) > 0 && IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && ! OPTON_CAPTURE_GROUP(reg->options)) { return 0; } return 1; } #ifdef USE_CALLOUT typedef struct { OnigCalloutType type; int in; OnigCalloutFunc start_func; OnigCalloutFunc end_func; int arg_num; int opt_arg_num; unsigned int arg_types[ONIG_CALLOUT_MAX_ARGS_NUM]; OnigValue opt_defaults[ONIG_CALLOUT_MAX_ARGS_NUM]; UChar* name; /* reference to GlobalCalloutNameTable entry: e->name */ } CalloutNameListEntry; typedef struct { int n; int alloc; CalloutNameListEntry* v; } CalloutNameListType; static CalloutNameListType* GlobalCalloutNameList; static int make_callout_func_list(CalloutNameListType** rs, int init_size) { CalloutNameListType* s; CalloutNameListEntry* v; *rs = 0; s = xmalloc(sizeof(*s)); if (IS_NULL(s)) return ONIGERR_MEMORY; v = (CalloutNameListEntry* )xmalloc(sizeof(CalloutNameListEntry) * init_size); if (IS_NULL(v)) { xfree(s); return ONIGERR_MEMORY; } s->n = 0; s->alloc = init_size; s->v = v; *rs = s; return ONIG_NORMAL; } static void free_callout_func_list(CalloutNameListType* s) { if (IS_NOT_NULL(s)) { if (IS_NOT_NULL(s->v)) { int i, j; for (i = 0; i < s->n; i++) { CalloutNameListEntry* e = s->v + i; for (j = e->arg_num - e->opt_arg_num; j < e->arg_num; j++) { if (e->arg_types[j] == ONIG_TYPE_STRING) { UChar* p = e->opt_defaults[j].s.start; if (IS_NOT_NULL(p)) xfree(p); } } } xfree(s->v); } xfree(s); } } static int callout_func_list_add(CalloutNameListType* s, int* rid) { if (s->n >= s->alloc) { int new_size = s->alloc * 2; CalloutNameListEntry* nv = (CalloutNameListEntry* ) xrealloc(s->v, sizeof(CalloutNameListEntry) * new_size); if (IS_NULL(nv)) return ONIGERR_MEMORY; s->alloc = new_size; s->v = nv; } *rid = s->n; xmemset(&(s->v[s->n]), 0, sizeof(*(s->v))); s->n++; return 
ONIG_NORMAL; } typedef struct { UChar* name; int name_len; /* byte length */ int id; } CalloutNameEntry; #ifdef USE_ST_LIBRARY typedef st_table CalloutNameTable; #else typedef struct { CalloutNameEntry* e; int num; int alloc; } CalloutNameTable; #endif static CalloutNameTable* GlobalCalloutNameTable; static int CalloutNameIDCounter; #ifdef USE_ST_LIBRARY static int i_free_callout_name_entry(st_callout_name_key* key, CalloutNameEntry* e, void* arg ARG_UNUSED) { if (IS_NOT_NULL(e)) { xfree(e->name); } /*xfree(key->s); */ /* is same as e->name */ xfree(key); xfree(e); return ST_DELETE; } static int callout_name_table_clear(CalloutNameTable* t) { if (IS_NOT_NULL(t)) { onig_st_foreach(t, i_free_callout_name_entry, 0); } return 0; } static int global_callout_name_table_free(void) { if (IS_NOT_NULL(GlobalCalloutNameTable)) { int r = callout_name_table_clear(GlobalCalloutNameTable); if (r != 0) return r; onig_st_free_table(GlobalCalloutNameTable); GlobalCalloutNameTable = 0; CalloutNameIDCounter = 0; } return 0; } static CalloutNameEntry* callout_name_find(OnigEncoding enc, int is_not_single, const UChar* name, const UChar* name_end) { int r; CalloutNameEntry* e; CalloutNameTable* t = GlobalCalloutNameTable; e = (CalloutNameEntry* )NULL; if (IS_NOT_NULL(t)) { r = onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end, (HashDataType* )((void* )(&e))); if (r == 0) { /* not found */ if (enc != ONIG_ENCODING_ASCII && ONIGENC_IS_ASCII_COMPATIBLE_ENCODING(enc)) { enc = ONIG_ENCODING_ASCII; onig_st_lookup_callout_name_table(t, enc, is_not_single, name, name_end, (HashDataType* )((void* )(&e))); } } } return e; } #else static int callout_name_table_clear(CalloutNameTable* t) { int i; CalloutNameEntry* e; if (IS_NOT_NULL(t)) { for (i = 0; i < t->num; i++) { e = &(t->e[i]); if (IS_NOT_NULL(e->name)) { xfree(e->name); e->name = NULL; e->name_len = 0; e->id = 0; e->func = 0; } } if (IS_NOT_NULL(t->e)) { xfree(t->e); t->e = NULL; } t->num = 0; } return 0; } static int 
global_callout_name_table_free(void) { if (IS_NOT_NULL(GlobalCalloutNameTable)) { int r = callout_name_table_clear(GlobalCalloutNameTable); if (r != 0) return r; xfree(GlobalCalloutNameTable); GlobalCalloutNameTable = 0; CalloutNameIDCounter = 0; } return 0; } static CalloutNameEntry* callout_name_find(UChar* name, UChar* name_end) { int i, len; CalloutNameEntry* e; CalloutNameTable* t = Calloutnames; if (IS_NOT_NULL(t)) { len = name_end - name; for (i = 0; i < t->num; i++) { e = &(t->e[i]); if (len == e->name_len && onig_strncmp(name, e->name, len) == 0) return e; } } return (CalloutNameEntry* )NULL; } #endif /* name string must be single byte char string. */ static int callout_name_entry(CalloutNameEntry** rentry, OnigEncoding enc, int is_not_single, UChar* name, UChar* name_end) { int r; CalloutNameEntry* e; CalloutNameTable* t = GlobalCalloutNameTable; *rentry = 0; if (name_end - name <= 0) return ONIGERR_INVALID_CALLOUT_NAME; e = callout_name_find(enc, is_not_single, name, name_end); if (IS_NULL(e)) { #ifdef USE_ST_LIBRARY if (IS_NULL(t)) { t = onig_st_init_callout_name_table_with_size(INIT_NAMES_ALLOC_NUM); CHECK_NULL_RETURN_MEMERR(t); GlobalCalloutNameTable = t; } e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry)); CHECK_NULL_RETURN_MEMERR(e); e->name = onigenc_strdup(enc, name, name_end); if (IS_NULL(e->name)) { xfree(e); return ONIGERR_MEMORY; } r = st_insert_callout_name_table(t, enc, is_not_single, e->name, (e->name + (name_end - name)), (HashDataType )e); if (r < 0) return r; #else int alloc; if (IS_NULL(t)) { alloc = INIT_NAMES_ALLOC_NUM; t = (CalloutNameTable* )xmalloc(sizeof(CalloutNameTable)); CHECK_NULL_RETURN_MEMERR(t); t->e = NULL; t->alloc = 0; t->num = 0; t->e = (CalloutNameEntry* )xmalloc(sizeof(CalloutNameEntry) * alloc); if (IS_NULL(t->e)) { xfree(t); return ONIGERR_MEMORY; } t->alloc = alloc; GlobalCalloutNameTable = t; goto clear; } else if (t->num == t->alloc) { int i; alloc = t->alloc * 2; t->e = (CalloutNameEntry* )xrealloc(t->e, 
sizeof(CalloutNameEntry) * alloc); CHECK_NULL_RETURN_MEMERR(t->e); t->alloc = alloc; clear: for (i = t->num; i < t->alloc; i++) { t->e[i].name = NULL; t->e[i].name_len = 0; t->e[i].id = 0; } } e = &(t->e[t->num]); t->num++; e->name = onigenc_strdup(enc, name, name_end); if (IS_NULL(e->name)) return ONIGERR_MEMORY; #endif CalloutNameIDCounter++; e->id = CalloutNameIDCounter; e->name_len = (int )(name_end - name); } *rentry = e; return e->id; } static int is_allowed_callout_name(OnigEncoding enc, UChar* name, UChar* name_end) { UChar* p; OnigCodePoint c; if (name >= name_end) return 0; p = name; while (p < name_end) { c = ONIGENC_MBC_TO_CODE(enc, p, name_end); if (! IS_ALLOWED_CODE_IN_CALLOUT_NAME(c)) return 0; if (p == name) { if (c >= '0' && c <= '9') return 0; } p += ONIGENC_MBC_ENC_LEN(enc, p); } return 1; } static int is_allowed_callout_tag_name(OnigEncoding enc, UChar* name, UChar* name_end) { UChar* p; OnigCodePoint c; if (name >= name_end) return 0; p = name; while (p < name_end) { c = ONIGENC_MBC_TO_CODE(enc, p, name_end); if (! 
IS_ALLOWED_CODE_IN_CALLOUT_TAG_NAME(c)) return 0; if (p == name) { if (c >= '0' && c <= '9') return 0; } p += ONIGENC_MBC_ENC_LEN(enc, p); } return 1; } extern int onig_set_callout_of_name(OnigEncoding enc, OnigCalloutType callout_type, UChar* name, UChar* name_end, int in, OnigCalloutFunc start_func, OnigCalloutFunc end_func, int arg_num, unsigned int arg_types[], int opt_arg_num, OnigValue opt_defaults[]) { int r; int i; int j; int id; int is_not_single; CalloutNameEntry* e; CalloutNameListEntry* fe; if (callout_type != ONIG_CALLOUT_TYPE_SINGLE) return ONIGERR_INVALID_ARGUMENT; if (arg_num < 0 || arg_num > ONIG_CALLOUT_MAX_ARGS_NUM) return ONIGERR_INVALID_CALLOUT_ARG; if (opt_arg_num < 0 || opt_arg_num > arg_num) return ONIGERR_INVALID_CALLOUT_ARG; if (start_func == 0 && end_func == 0) return ONIGERR_INVALID_CALLOUT_ARG; if ((in & ONIG_CALLOUT_IN_PROGRESS) == 0 && (in & ONIG_CALLOUT_IN_RETRACTION) == 0) return ONIGERR_INVALID_CALLOUT_ARG; for (i = 0; i < arg_num; i++) { unsigned int t = arg_types[i]; if (t == ONIG_TYPE_VOID) return ONIGERR_INVALID_CALLOUT_ARG; else { if (i >= arg_num - opt_arg_num) { if (t != ONIG_TYPE_LONG && t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING && t != ONIG_TYPE_TAG) return ONIGERR_INVALID_CALLOUT_ARG; } else { if (t != ONIG_TYPE_LONG) { t = t & ~ONIG_TYPE_LONG; if (t != ONIG_TYPE_CHAR && t != ONIG_TYPE_STRING && t != ONIG_TYPE_TAG) return ONIGERR_INVALID_CALLOUT_ARG; } } } } if (! 
is_allowed_callout_name(enc, name, name_end)) { return ONIGERR_INVALID_CALLOUT_NAME; } is_not_single = (callout_type != ONIG_CALLOUT_TYPE_SINGLE); id = callout_name_entry(&e, enc, is_not_single, name, name_end); if (id < 0) return id; r = ONIG_NORMAL; if (IS_NULL(GlobalCalloutNameList)) { r = make_callout_func_list(&GlobalCalloutNameList, 10); if (r != ONIG_NORMAL) return r; } while (id >= GlobalCalloutNameList->n) { int rid; r = callout_func_list_add(GlobalCalloutNameList, &rid); if (r != ONIG_NORMAL) return r; } fe = GlobalCalloutNameList->v + id; fe->type = callout_type; fe->in = in; fe->start_func = start_func; fe->end_func = end_func; fe->arg_num = arg_num; fe->opt_arg_num = opt_arg_num; fe->name = e->name; for (i = 0; i < arg_num; i++) { fe->arg_types[i] = arg_types[i]; } for (i = arg_num - opt_arg_num, j = 0; i < arg_num; i++, j++) { if (IS_NULL(opt_defaults)) return ONIGERR_INVALID_ARGUMENT; if (fe->arg_types[i] == ONIG_TYPE_STRING) { OnigValue* val; UChar* ds; val = opt_defaults + j; ds = onigenc_strdup(enc, val->s.start, val->s.end); CHECK_NULL_RETURN_MEMERR(ds); fe->opt_defaults[i].s.start = ds; fe->opt_defaults[i].s.end = ds + (val->s.end - val->s.start); } else { fe->opt_defaults[i] = opt_defaults[j]; } } r = id; return r; } static int get_callout_name_id_by_name(OnigEncoding enc, int is_not_single, UChar* name, UChar* name_end, int* rid) { int r; CalloutNameEntry* e; if (! is_allowed_callout_name(enc, name, name_end)) { return ONIGERR_INVALID_CALLOUT_NAME; } e = callout_name_find(enc, is_not_single, name, name_end); if (IS_NULL(e)) { return ONIGERR_UNDEFINED_CALLOUT_NAME; } r = ONIG_NORMAL; *rid = e->id; return r; } extern OnigCalloutFunc onig_get_callout_start_func(regex_t* reg, int callout_num) { /* If used for callouts of contents, return 0. 
*/ CalloutListEntry* e; e = onig_reg_callout_list_at(reg, callout_num); CHECK_NULL_RETURN(e); return e->start_func; } extern const UChar* onig_get_callout_tag_start(regex_t* reg, int callout_num) { CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num); CHECK_NULL_RETURN(e); return e->tag_start; } extern const UChar* onig_get_callout_tag_end(regex_t* reg, int callout_num) { CalloutListEntry* e = onig_reg_callout_list_at(reg, callout_num); CHECK_NULL_RETURN(e); return e->tag_end; } extern OnigCalloutType onig_get_callout_type_by_name_id(int name_id) { if (name_id < 0 || name_id >= GlobalCalloutNameList->n) return 0; return GlobalCalloutNameList->v[name_id].type; } extern OnigCalloutFunc onig_get_callout_start_func_by_name_id(int name_id) { if (name_id < 0 || name_id >= GlobalCalloutNameList->n) return 0; return GlobalCalloutNameList->v[name_id].start_func; } extern OnigCalloutFunc onig_get_callout_end_func_by_name_id(int name_id) { if (name_id < 0 || name_id >= GlobalCalloutNameList->n) return 0; return GlobalCalloutNameList->v[name_id].end_func; } extern int onig_get_callout_in_by_name_id(int name_id) { if (name_id < 0 || name_id >= GlobalCalloutNameList->n) return 0; return GlobalCalloutNameList->v[name_id].in; } static int get_callout_arg_num_by_name_id(int name_id) { return GlobalCalloutNameList->v[name_id].arg_num; } static int get_callout_opt_arg_num_by_name_id(int name_id) { return GlobalCalloutNameList->v[name_id].opt_arg_num; } static unsigned int get_callout_arg_type_by_name_id(int name_id, int index) { return GlobalCalloutNameList->v[name_id].arg_types[index]; } static OnigValue get_callout_opt_default_by_name_id(int name_id, int index) { return GlobalCalloutNameList->v[name_id].opt_defaults[index]; } extern UChar* onig_get_callout_name_by_name_id(int name_id) { if (name_id < 0 || name_id >= GlobalCalloutNameList->n) return 0; return GlobalCalloutNameList->v[name_id].name; } extern int onig_global_callout_names_free(void) { 
free_callout_func_list(GlobalCalloutNameList); GlobalCalloutNameList = 0; global_callout_name_table_free(); return ONIG_NORMAL; } typedef st_table CalloutTagTable; typedef intptr_t CalloutTagVal; #define CALLOUT_TAG_LIST_FLAG_TAG_EXIST (1<<0) static int i_callout_callout_list_set(UChar* key, CalloutTagVal e, void* arg) { int num; RegexExt* ext = (RegexExt* )arg; num = (int )e - 1; ext->callout_list[num].flag |= CALLOUT_TAG_LIST_FLAG_TAG_EXIST; return ST_CONTINUE; } static int setup_ext_callout_list_values(regex_t* reg) { int i, j; RegexExt* ext; ext = reg->extp; if (IS_NOT_NULL(ext->tag_table)) { onig_st_foreach((CalloutTagTable *)ext->tag_table, i_callout_callout_list_set, (st_data_t )ext); } for (i = 0; i < ext->callout_num; i++) { CalloutListEntry* e = ext->callout_list + i; if (e->of == ONIG_CALLOUT_OF_NAME) { for (j = 0; j < e->u.arg.num; j++) { if (e->u.arg.types[j] == ONIG_TYPE_TAG) { UChar* start; UChar* end; int num; start = e->u.arg.vals[j].s.start; end = e->u.arg.vals[j].s.end; num = onig_get_callout_num_by_tag(reg, start, end); if (num < 0) return num; e->u.arg.vals[j].tag = num; } } } } return ONIG_NORMAL; } extern int onig_callout_tag_is_exist_at_callout_num(regex_t* reg, int callout_num) { RegexExt* ext = reg->extp; if (IS_NULL(ext) || IS_NULL(ext->callout_list)) return 0; if (callout_num > ext->callout_num) return 0; return (ext->callout_list[callout_num].flag & CALLOUT_TAG_LIST_FLAG_TAG_EXIST) != 0; } static int i_free_callout_tag_entry(UChar* key, CalloutTagVal e, void* arg ARG_UNUSED) { xfree(key); return ST_DELETE; } static int callout_tag_table_clear(CalloutTagTable* t) { if (IS_NOT_NULL(t)) { onig_st_foreach(t, i_free_callout_tag_entry, 0); } return 0; } extern int onig_callout_tag_table_free(void* table) { CalloutTagTable* t = (CalloutTagTable* )table; if (IS_NOT_NULL(t)) { int r = callout_tag_table_clear(t); if (r != 0) return r; onig_st_free_table(t); } return 0; } extern int onig_get_callout_num_by_tag(regex_t* reg, const UChar* tag, const 
UChar* tag_end) { int r; RegexExt* ext; CalloutTagVal e; ext = reg->extp; if (IS_NULL(ext) || IS_NULL(ext->tag_table)) return ONIGERR_INVALID_CALLOUT_TAG_NAME; r = onig_st_lookup_strend(ext->tag_table, tag, tag_end, (HashDataType* )((void* )(&e))); if (r == 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME; return (int )e; } static CalloutTagVal callout_tag_find(CalloutTagTable* t, const UChar* name, const UChar* name_end) { CalloutTagVal e; e = -1; if (IS_NOT_NULL(t)) { onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); } return e; } static int callout_tag_table_new(CalloutTagTable** rt) { CalloutTagTable* t; *rt = 0; t = onig_st_init_strend_table_with_size(INIT_TAG_NAMES_ALLOC_NUM); CHECK_NULL_RETURN_MEMERR(t); *rt = t; return ONIG_NORMAL; } static int callout_tag_entry_raw(ParseEnv* env, CalloutTagTable* t, UChar* name, UChar* name_end, CalloutTagVal entry_val) { int r; CalloutTagVal val; if (name_end - name <= 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME; val = callout_tag_find(t, name, name_end); if (val >= 0) { onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, name, name_end); return ONIGERR_MULTIPLEX_DEFINED_NAME; } r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val); if (r < 0) return r; return ONIG_NORMAL; } static int ext_ensure_tag_table(regex_t* reg) { int r; RegexExt* ext; CalloutTagTable* t; ext = onig_get_regex_ext(reg); CHECK_NULL_RETURN_MEMERR(ext); if (IS_NULL(ext->tag_table)) { r = callout_tag_table_new(&t); if (r != ONIG_NORMAL) return r; ext->tag_table = t; } return ONIG_NORMAL; } static int callout_tag_entry(ParseEnv* env, regex_t* reg, UChar* name, UChar* name_end, CalloutTagVal entry_val) { int r; RegexExt* ext; CalloutListEntry* e; r = ext_ensure_tag_table(reg); if (r != ONIG_NORMAL) return r; ext = onig_get_regex_ext(reg); CHECK_NULL_RETURN_MEMERR(ext); r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val); e = onig_reg_callout_list_at(reg, (int )entry_val); 
CHECK_NULL_RETURN_MEMERR(e); e->tag_start = name; e->tag_end = name_end; return r; } #endif /* USE_CALLOUT */ #define INIT_PARSEENV_MEMENV_ALLOC_SIZE 16 static void scan_env_clear(ParseEnv* env) { MEM_STATUS_CLEAR(env->cap_history); MEM_STATUS_CLEAR(env->backtrack_mem); MEM_STATUS_CLEAR(env->backrefed_mem); env->error = (UChar* )NULL; env->error_end = (UChar* )NULL; env->num_call = 0; #ifdef USE_CALL env->unset_addr_list = NULL; #endif env->num_mem = 0; env->num_named = 0; env->mem_alloc = 0; env->mem_env_dynamic = (MemEnv* )NULL; xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static)); env->parse_depth = 0; #ifdef ONIG_DEBUG_PARSE env->max_parse_depth = 0; #endif env->backref_num = 0; env->keep_num = 0; env->id_num = 0; env->save_alloc_num = 0; env->saves = 0; env->flags = 0; } static int scan_env_add_mem_entry(ParseEnv* env) { int i, need, alloc; MemEnv* p; need = env->num_mem + 1; if (need > MaxCaptureNum && MaxCaptureNum != 0) return ONIGERR_TOO_MANY_CAPTURES; if (need >= PARSEENV_MEMENV_SIZE) { if (env->mem_alloc <= need) { if (IS_NULL(env->mem_env_dynamic)) { alloc = INIT_PARSEENV_MEMENV_ALLOC_SIZE; p = (MemEnv* )xmalloc(sizeof(MemEnv) * alloc); CHECK_NULL_RETURN_MEMERR(p); xmemcpy(p, env->mem_env_static, sizeof(env->mem_env_static)); } else { alloc = env->mem_alloc * 2; p = (MemEnv* )xrealloc(env->mem_env_dynamic, sizeof(MemEnv) * alloc); CHECK_NULL_RETURN_MEMERR(p); } for (i = env->num_mem + 1; i < alloc; i++) { p[i].mem_node = NULL_NODE; p[i].empty_repeat_node = NULL_NODE; } env->mem_env_dynamic = p; env->mem_alloc = alloc; } } env->num_mem++; return env->num_mem; } static int scan_env_set_mem_node(ParseEnv* env, int num, Node* node) { if (env->num_mem >= num) PARSEENV_MEMENV(env)[num].mem_node = node; else return ONIGERR_PARSER_BUG; return 0; } static void node_free_body(Node* node) { if (IS_NULL(node)) return ; switch (NODE_TYPE(node)) { case NODE_STRING: if (STR_(node)->capacity != 0 && IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != 
STR_(node)->buf) { xfree(STR_(node)->s); } break; case NODE_LIST: case NODE_ALT: onig_node_free(NODE_CAR(node)); node = NODE_CDR(node); while (IS_NOT_NULL(node)) { Node* next = NODE_CDR(node); onig_node_free(NODE_CAR(node)); xfree(node); node = next; } break; case NODE_CCLASS: { CClassNode* cc = CCLASS_(node); if (cc->mbuf) bbuf_free(cc->mbuf); } break; case NODE_BACKREF: if (IS_NOT_NULL(BACKREF_(node)->back_dynamic)) xfree(BACKREF_(node)->back_dynamic); break; case NODE_BAG: if (NODE_BODY(node)) onig_node_free(NODE_BODY(node)); { BagNode* en = BAG_(node); if (en->type == BAG_IF_ELSE) { onig_node_free(en->te.Then); onig_node_free(en->te.Else); } } break; case NODE_QUANT: if (NODE_BODY(node)) onig_node_free(NODE_BODY(node)); break; case NODE_ANCHOR: if (NODE_BODY(node)) onig_node_free(NODE_BODY(node)); if (IS_NOT_NULL(ANCHOR_(node)->lead_node)) onig_node_free(ANCHOR_(node)->lead_node); break; case NODE_CTYPE: case NODE_CALL: case NODE_GIMMICK: break; } } extern void onig_node_free(Node* node) { if (IS_NULL(node)) return ; #ifdef DEBUG_NODE_FREE fprintf(stderr, "onig_node_free: %p\n", node); #endif node_free_body(node); xfree(node); } static void cons_node_free_alone(Node* node) { NODE_CAR(node) = 0; NODE_CDR(node) = 0; onig_node_free(node); } static Node* node_new(void) { Node* node; node = (Node* )xmalloc(sizeof(Node)); CHECK_NULL_RETURN(node); xmemset(node, 0, sizeof(*node)); #ifdef DEBUG_NODE_FREE fprintf(stderr, "node_new: %p\n", node); #endif return node; } extern int onig_node_copy(Node** rcopy, Node* from) { int r; Node* copy; *rcopy = NULL_NODE; switch (NODE_TYPE(from)) { case NODE_LIST: case NODE_ALT: case NODE_ANCHOR: /* These node's link to other nodes are processed by caller. */ break; case NODE_STRING: case NODE_CCLASS: case NODE_CTYPE: /* Fixed contents after copy. */ break; default: /* Not supported yet. 
*/ return ONIGERR_TYPE_BUG; break; } copy = node_new(); CHECK_NULL_RETURN_MEMERR(copy); xmemcpy(copy, from, sizeof(*copy)); switch (NODE_TYPE(copy)) { case NODE_STRING: r = onig_node_str_set(copy, STR_(from)->s, STR_(from)->end, FALSE); if (r != 0) { err: onig_node_free(copy); return r; } break; case NODE_CCLASS: { CClassNode *fcc, *tcc; fcc = CCLASS_(from); tcc = CCLASS_(copy); if (IS_NOT_NULL(fcc->mbuf)) { r = bbuf_clone(&(tcc->mbuf), fcc->mbuf); if (r != 0) goto err; } } break; default: break; } *rcopy = copy; return ONIG_NORMAL; } static void initialize_cclass(CClassNode* cc) { BITSET_CLEAR(cc->bs); cc->flags = 0; cc->mbuf = NULL; } static Node* node_new_cclass(void) { Node* node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_CCLASS); initialize_cclass(CCLASS_(node)); return node; } static Node* node_new_ctype(int type, int not, OnigOptionType options) { Node* node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_CTYPE); CTYPE_(node)->ctype = type; CTYPE_(node)->not = not; CTYPE_(node)->ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(type, options); return node; } static Node* node_new_anychar(OnigOptionType options) { Node* node; node = node_new_ctype(CTYPE_ANYCHAR, FALSE, options); CHECK_NULL_RETURN(node); if (OPTON_MULTILINE(options)) NODE_STATUS_ADD(node, MULTILINE); return node; } static int node_new_no_newline(Node** node, ParseEnv* env) { Node* n; n = node_new_anychar(ONIG_OPTION_NONE); CHECK_NULL_RETURN_MEMERR(n); *node = n; return 0; } static int node_new_true_anychar(Node** node) { Node* n; n = node_new_anychar(ONIG_OPTION_MULTILINE); CHECK_NULL_RETURN_MEMERR(n); *node = n; return 0; } static Node* node_new_list(Node* left, Node* right) { Node* node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_LIST); NODE_CAR(node) = left; NODE_CDR(node) = right; return node; } extern Node* onig_node_new_list(Node* left, Node* right) { return node_new_list(left, right); } extern Node* onig_node_new_alt(Node* left, Node* right) 
{ Node* node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_ALT); NODE_CAR(node) = left; NODE_CDR(node) = right; return node; } static Node* make_list_or_alt(NodeType type, int n, Node* ns[]) { Node* r; if (n <= 0) return NULL_NODE; if (n == 1) { r = node_new(); CHECK_NULL_RETURN(r); NODE_SET_TYPE(r, type); NODE_CAR(r) = ns[0]; NODE_CDR(r) = NULL_NODE; } else { Node* right; r = node_new(); CHECK_NULL_RETURN(r); right = make_list_or_alt(type, n - 1, ns + 1); if (IS_NULL(right)) { onig_node_free(r); return NULL_NODE; } NODE_SET_TYPE(r, type); NODE_CAR(r) = ns[0]; NODE_CDR(r) = right; } return r; } static Node* make_list(int n, Node* ns[]) { return make_list_or_alt(NODE_LIST, n, ns); } static Node* make_alt(int n, Node* ns[]) { return make_list_or_alt(NODE_ALT, n, ns); } static Node* node_new_anchor(int type) { Node* node; node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_ANCHOR); ANCHOR_(node)->type = type; ANCHOR_(node)->char_min_len = 0; ANCHOR_(node)->char_max_len = INFINITE_LEN; ANCHOR_(node)->ascii_mode = 0; ANCHOR_(node)->lead_node = NULL_NODE; return node; } static Node* node_new_anchor_with_options(int type, OnigOptionType options) { int ascii_mode; Node* node; node = node_new_anchor(type); CHECK_NULL_RETURN(node); ascii_mode = OPTON_WORD_ASCII(options) && IS_WORD_ANCHOR_TYPE(type) ? 
1 : 0; ANCHOR_(node)->ascii_mode = ascii_mode; if (type == ANCR_TEXT_SEGMENT_BOUNDARY || type == ANCR_NO_TEXT_SEGMENT_BOUNDARY) { if (OPTON_TEXT_SEGMENT_WORD(options)) NODE_STATUS_ADD(node, TEXT_SEGMENT_WORD); } return node; } static Node* node_new_backref(int back_num, int* backrefs, int by_name, #ifdef USE_BACKREF_WITH_LEVEL int exist_level, int nest_level, #endif ParseEnv* env) { int i; Node* node; node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_BACKREF); BACKREF_(node)->back_num = back_num; BACKREF_(node)->back_dynamic = (int* )NULL; if (by_name != 0) NODE_STATUS_ADD(node, BY_NAME); if (OPTON_IGNORECASE(env->options)) NODE_STATUS_ADD(node, IGNORECASE); #ifdef USE_BACKREF_WITH_LEVEL if (exist_level != 0) { NODE_STATUS_ADD(node, NEST_LEVEL); BACKREF_(node)->nest_level = nest_level; } #endif for (i = 0; i < back_num; i++) { if (backrefs[i] <= env->num_mem && IS_NULL(PARSEENV_MEMENV(env)[backrefs[i]].mem_node)) { NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */ break; } } if (back_num <= NODE_BACKREFS_SIZE) { for (i = 0; i < back_num; i++) BACKREF_(node)->back_static[i] = backrefs[i]; } else { int* p = (int* )xmalloc(sizeof(int) * back_num); if (IS_NULL(p)) { onig_node_free(node); return NULL; } BACKREF_(node)->back_dynamic = p; for (i = 0; i < back_num; i++) p[i] = backrefs[i]; } env->backref_num++; return node; } static Node* node_new_backref_checker(int back_num, int* backrefs, int by_name, #ifdef USE_BACKREF_WITH_LEVEL int exist_level, int nest_level, #endif ParseEnv* env) { Node* node; node = node_new_backref(back_num, backrefs, by_name, #ifdef USE_BACKREF_WITH_LEVEL exist_level, nest_level, #endif env); CHECK_NULL_RETURN(node); NODE_STATUS_ADD(node, CHECKER); return node; } #ifdef USE_CALL static Node* node_new_call(UChar* name, UChar* name_end, int gnum, int by_number) { Node* node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_CALL); CALL_(node)->by_number = by_number; CALL_(node)->name = name; CALL_(node)->name_end 
= name_end; CALL_(node)->called_gnum = gnum; CALL_(node)->entry_count = 1; return node; } #endif static Node* node_new_quantifier(int lower, int upper, int by_number) { Node* node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_QUANT); QUANT_(node)->lower = lower; QUANT_(node)->upper = upper; QUANT_(node)->greedy = 1; QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; QUANT_(node)->head_exact = NULL_NODE; QUANT_(node)->next_head_exact = NULL_NODE; QUANT_(node)->include_referred = 0; QUANT_(node)->empty_status_mem = 0; if (by_number != 0) NODE_STATUS_ADD(node, BY_NUMBER); return node; } static Node* node_new_bag(enum BagType type) { Node* node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_BAG); BAG_(node)->type = type; switch (type) { case BAG_MEMORY: BAG_(node)->m.regnum = 0; BAG_(node)->m.called_addr = -1; BAG_(node)->m.entry_count = 1; BAG_(node)->m.called_state = 0; break; case BAG_OPTION: BAG_(node)->o.options = 0; break; case BAG_STOP_BACKTRACK: break; case BAG_IF_ELSE: BAG_(node)->te.Then = 0; BAG_(node)->te.Else = 0; break; } BAG_(node)->opt_count = 0; return node; } extern Node* onig_node_new_bag(enum BagType type) { return node_new_bag(type); } static Node* node_new_bag_if_else(Node* cond, Node* Then, Node* Else) { Node* n; n = node_new_bag(BAG_IF_ELSE); CHECK_NULL_RETURN(n); NODE_BODY(n) = cond; BAG_(n)->te.Then = Then; BAG_(n)->te.Else = Else; return n; } static Node* node_new_memory(int is_named) { Node* node = node_new_bag(BAG_MEMORY); CHECK_NULL_RETURN(node); if (is_named != 0) NODE_STATUS_ADD(node, NAMED_GROUP); return node; } static Node* node_new_option(OnigOptionType option) { Node* node = node_new_bag(BAG_OPTION); CHECK_NULL_RETURN(node); BAG_(node)->o.options = option; return node; } static Node* node_new_group(Node* content) { Node* node; node = node_new(); CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_LIST); NODE_CAR(node) = content; NODE_CDR(node) = NULL_NODE; return node; } static Node* node_drop_group(Node* 
group) { Node* content; content = NODE_CAR(group); NODE_CAR(group) = NULL_NODE; onig_node_free(group); return content; } static int node_set_fail(Node* node) { NODE_SET_TYPE(node, NODE_GIMMICK); GIMMICK_(node)->type = GIMMICK_FAIL; return ONIG_NORMAL; } static int node_new_fail(Node** node, ParseEnv* env) { *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); return node_set_fail(*node); } extern int onig_node_reset_fail(Node* node) { node_free_body(node); return node_set_fail(node); } static int node_new_save_gimmick(Node** node, enum SaveType save_type, ParseEnv* env) { int id; ID_ENTRY(env, id); *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); NODE_SET_TYPE(*node, NODE_GIMMICK); GIMMICK_(*node)->id = id; GIMMICK_(*node)->type = GIMMICK_SAVE; GIMMICK_(*node)->detail_type = (int )save_type; return ONIG_NORMAL; } static int node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type, int id, ParseEnv* env) { *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); NODE_SET_TYPE(*node, NODE_GIMMICK); GIMMICK_(*node)->id = id; GIMMICK_(*node)->type = GIMMICK_UPDATE_VAR; GIMMICK_(*node)->detail_type = (int )update_var_type; return ONIG_NORMAL; } static int node_new_keep(Node** node, ParseEnv* env) { int r; r = node_new_save_gimmick(node, SAVE_KEEP, env); if (r != 0) return r; env->keep_num++; return ONIG_NORMAL; } #ifdef USE_CALLOUT extern void onig_free_reg_callout_list(int n, CalloutListEntry* list) { int i; int j; if (IS_NULL(list)) return ; for (i = 0; i < n; i++) { if (list[i].of == ONIG_CALLOUT_OF_NAME) { for (j = 0; j < list[i].u.arg.passed_num; j++) { if (list[i].u.arg.types[j] == ONIG_TYPE_STRING) { if (IS_NOT_NULL(list[i].u.arg.vals[j].s.start)) xfree(list[i].u.arg.vals[j].s.start); } } } else { /* ONIG_CALLOUT_OF_CONTENTS */ if (IS_NOT_NULL(list[i].u.content.start)) { xfree((void* )list[i].u.content.start); } } } xfree(list); } extern CalloutListEntry* onig_reg_callout_list_at(regex_t* reg, int num) { RegexExt* ext = reg->extp; 
CHECK_NULL_RETURN(ext); if (num <= 0 || num > ext->callout_num) return 0; num--; return ext->callout_list + num; } static int reg_callout_list_entry(ParseEnv* env, int* rnum) { #define INIT_CALLOUT_LIST_NUM 3 int num; CalloutListEntry* list; CalloutListEntry* e; RegexExt* ext; ext = onig_get_regex_ext(env->reg); CHECK_NULL_RETURN_MEMERR(ext); if (IS_NULL(ext->callout_list)) { list = (CalloutListEntry* )xmalloc(sizeof(*list) * INIT_CALLOUT_LIST_NUM); CHECK_NULL_RETURN_MEMERR(list); ext->callout_list = list; ext->callout_list_alloc = INIT_CALLOUT_LIST_NUM; ext->callout_num = 0; } num = ext->callout_num + 1; if (num > ext->callout_list_alloc) { int alloc = ext->callout_list_alloc * 2; list = (CalloutListEntry* )xrealloc(ext->callout_list, sizeof(CalloutListEntry) * alloc); CHECK_NULL_RETURN_MEMERR(list); ext->callout_list = list; ext->callout_list_alloc = alloc; } e = ext->callout_list + (num - 1); e->flag = 0; e->of = 0; e->in = ONIG_CALLOUT_OF_CONTENTS; e->type = 0; e->tag_start = 0; e->tag_end = 0; e->start_func = 0; e->end_func = 0; e->u.arg.num = 0; e->u.arg.passed_num = 0; ext->callout_num = num; *rnum = num; return ONIG_NORMAL; } static int node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id, ParseEnv* env) { *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); NODE_SET_TYPE(*node, NODE_GIMMICK); GIMMICK_(*node)->id = id; GIMMICK_(*node)->num = num; GIMMICK_(*node)->type = GIMMICK_CALLOUT; GIMMICK_(*node)->detail_type = (int )callout_of; return ONIG_NORMAL; } #endif static int make_text_segment(Node** node, ParseEnv* env) { int r; int i; Node* x; Node* ns[2]; /* \X == (?>\O(?:\Y\O)*) */ ns[1] = NULL_NODE; r = ONIGERR_MEMORY; ns[0] = node_new_anchor_with_options(ANCR_NO_TEXT_SEGMENT_BOUNDARY, env->options); if (IS_NULL(ns[0])) goto err; r = node_new_true_anychar(&ns[1]); if (r != 0) goto err1; x = make_list(2, ns); if (IS_NULL(x)) goto err; ns[0] = x; ns[1] = NULL_NODE; x = node_new_quantifier(0, INFINITE_REPEAT, TRUE); if (IS_NULL(x)) goto 
err; NODE_BODY(x) = ns[0]; ns[0] = NULL_NODE; ns[1] = x; r = node_new_true_anychar(&ns[0]); if (r != 0) goto err1; x = make_list(2, ns); if (IS_NULL(x)) goto err; ns[0] = x; ns[1] = NULL_NODE; x = node_new_bag(BAG_STOP_BACKTRACK); if (IS_NULL(x)) goto err; NODE_BODY(x) = ns[0]; *node = x; return ONIG_NORMAL; err: r = ONIGERR_MEMORY; err1: for (i = 0; i < 2; i++) onig_node_free(ns[i]); return r; } static int make_absent_engine(Node** node, int pre_save_right_id, Node* absent, Node* step_one, int lower, int upper, int possessive, int is_range_cutter, ParseEnv* env) { int r; int i; int id; Node* x; Node* ns[4]; for (i = 0; i < 4; i++) ns[i] = NULL_NODE; ns[1] = absent; ns[3] = step_one; /* for err */ r = node_new_save_gimmick(&ns[0], SAVE_S, env); if (r != 0) goto err; id = GIMMICK_(ns[0])->id; r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_S_STACK, id, env); if (r != 0) goto err; if (is_range_cutter != 0) NODE_STATUS_ADD(ns[2], ABSENT_WITH_SIDE_EFFECTS); r = node_new_fail(&ns[3], env); if (r != 0) goto err; x = make_list(4, ns); if (IS_NULL(x)) goto err0; ns[0] = x; ns[1] = step_one; ns[2] = ns[3] = NULL_NODE; x = make_alt(2, ns); if (IS_NULL(x)) goto err0; ns[0] = x; x = node_new_quantifier(lower, upper, FALSE); if (IS_NULL(x)) goto err0; NODE_BODY(x) = ns[0]; ns[0] = x; if (possessive != 0) { x = node_new_bag(BAG_STOP_BACKTRACK); if (IS_NULL(x)) goto err0; NODE_BODY(x) = ns[0]; ns[0] = x; } r = node_new_update_var_gimmick(&ns[1], UPDATE_VAR_RIGHT_RANGE_FROM_STACK, pre_save_right_id, env); if (r != 0) goto err; r = node_new_fail(&ns[2], env); if (r != 0) goto err; x = make_list(2, ns + 1); if (IS_NULL(x)) goto err0; ns[1] = x; ns[2] = NULL_NODE; x = make_alt(2, ns); if (IS_NULL(x)) goto err0; if (is_range_cutter != FALSE) NODE_STATUS_ADD(x, SUPER); *node = x; return ONIG_NORMAL; err0: r = ONIGERR_MEMORY; err: for (i = 0; i < 4; i++) onig_node_free(ns[i]); return r; } static int make_absent_tail(Node** node1, Node** node2, int pre_save_right_id, 
ParseEnv* env)
{
  int r;
  int id;
  Node* save;
  Node* x;
  Node* ns[2];

  *node1 = *node2 = NULL_NODE;
  save = ns[0] = ns[1] = NULL_NODE;

  r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env);
  if (r != 0) goto err;

  id = GIMMICK_(save)->id;
  r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
                                  id, env);
  if (r != 0) goto err;

  r = node_new_fail(&ns[1], env);
  if (r != 0) goto err;

  x = make_list(2, ns);
  if (IS_NULL(x)) goto err0;

  /* The list now owns both nodes; keep only the list in ns[1]. */
  ns[0] = NULL_NODE; ns[1] = x;

  r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
                                  pre_save_right_id, env);
  if (r != 0) goto err;

  x = make_alt(2, ns);
  if (IS_NULL(x)) goto err0;

  /* On success the save gimmick and the alternation are handed out
     separately through node1 and node2. */
  *node1 = save;
  *node2 = x;
  return ONIG_NORMAL;

 err0:
  r = ONIGERR_MEMORY;
 err:
  onig_node_free(save);
  onig_node_free(ns[0]);
  onig_node_free(ns[1]);
  return r;
}

/*
 * Build into *node a list of a SAVE_RIGHT_RANGE gimmick followed by an
 * alternation (marked SUPER) of an UPDATE_VAR_RIGHT_RANGE_INIT gimmick and a
 * (restore right range from the save, fail) list.
 * Returns ONIG_NORMAL or a negative error code; *node is NULL_NODE on error.
 */
static int
make_range_clear(Node** node, ParseEnv* env)
{
  int r;
  int id;
  Node* save;
  Node* x;
  Node* ns[2];

  *node = NULL_NODE;
  save = ns[0] = ns[1] = NULL_NODE;

  r = node_new_save_gimmick(&save, SAVE_RIGHT_RANGE, env);
  if (r != 0) goto err;

  id = GIMMICK_(save)->id;
  r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
                                  id, env);
  if (r != 0) goto err;

  r = node_new_fail(&ns[1], env);
  if (r != 0) goto err;

  x = make_list(2, ns);
  if (IS_NULL(x)) goto err0;

  ns[0] = NULL_NODE; ns[1] = x;

#define ID_NOT_USED_DONT_CARE_ME   0

  r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT,
                                  ID_NOT_USED_DONT_CARE_ME, env);
  if (r != 0) goto err;
  NODE_STATUS_ADD(ns[0], ABSENT_WITH_SIDE_EFFECTS);

  x = make_alt(2, ns);
  if (IS_NULL(x)) goto err0;

  NODE_STATUS_ADD(x, SUPER);

  /* save moves into ns[0] so that exactly one variable owns it on the
     error path below. */
  ns[0] = save;
  ns[1] = x;
  save = NULL_NODE;
  x = make_list(2, ns);
  if (IS_NULL(x)) goto err0;

  *node = x;
  return ONIG_NORMAL;

 err0:
  r = ONIGERR_MEMORY;
 err:
  onig_node_free(save);
  onig_node_free(ns[0]);
  onig_node_free(ns[1]);
  return r;
}

/*
 * Test whether node is a greedy repeat of a single character: a quantifier,
 * or a stop-backtrack bag around a quantifier (then *is_possessive is 1),
 * whose body is a character class or a string of exactly one character.
 *
 * On a match (return 1) the quantifier and its body are detached from each
 * other and returned through *rquant and *rbody; an enclosing bag is freed.
 * Otherwise 0 is returned and node is left as it was.
 */
static int
is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody,
                          int* is_possessive, ParseEnv* env)
{
  Node* quant;
  Node* body;

  *rquant = *rbody = 0;
  *is_possessive = 0;

  if (NODE_TYPE(node) == NODE_QUANT) {
    quant = node;
  }
  else {
    if (NODE_TYPE(node) == NODE_BAG) {
      BagNode* en = BAG_(node);
      if (en->type == BAG_STOP_BACKTRACK) {
        *is_possessive = 1;
        quant = NODE_BAG_BODY(en);
        if (NODE_TYPE(quant) != NODE_QUANT)
          return 0;
      }
      else
        return 0;
    }
    else
      return 0;
  }

  if (QUANT_(quant)->greedy == 0)
    return 0;

  body = NODE_BODY(quant);
  switch (NODE_TYPE(body)) {
  case NODE_STRING:
    {
      int len;
      StrNode* sn = STR_(body);
      UChar *s = sn->s;

      /* count the characters of the string */
      len = 0;
      while (s < sn->end) {
        s += enclen(env->enc, s);
        len++;
      }
      if (len != 1)
        return 0;
    }
    /* no break here: a one character string continues into the
       NODE_CCLASS case, which only breaks out of the switch */

  case NODE_CCLASS:
    break;

  default:
    return 0;
    break;
  }

  if (node != quant) {
    /* drop the enclosing bag but keep the quantifier it contained */
    NODE_BODY(node) = 0;
    onig_node_free(node);
  }
  NODE_BODY(quant) = NULL_NODE;
  *rquant = quant;
  *rbody  = body;
  return 1;
}

/*
 * Build the absent tree for the simple one-character-repeat case into *node:
 * a list of (save right range, absent engine over body, restore right range).
 * lower/upper are read from quant; quant itself is not modified or freed.
 *
 * NOTE(review): the error exit frees ns[2]/ns[3] (body/absent) while they are
 * still stored there, and make_absent_engine() also frees nodes it was given
 * on some of its own error paths; make_absent_tree() frees body and absent
 * again when this function fails. Ownership on failure should be confirmed.
 */
static int
make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* quant,
                                            Node* body, int possessive, ParseEnv* env)
{
  int r;
  int i;
  int id1;
  int lower, upper;
  Node* x;
  Node* ns[4];

  *node = NULL_NODE;
  r = ONIGERR_MEMORY;
  ns[0] = ns[1] = NULL_NODE;
  ns[2] = body, ns[3] = absent;

  lower = QUANT_(quant)->lower;
  upper = QUANT_(quant)->upper;

  r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env);
  if (r != 0) goto err;

  id1 = GIMMICK_(ns[0])->id;

  r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive,
                         FALSE, env);
  if (r != 0) goto err;

  /* body and absent are part of the engine tree in ns[1] from here on */
  ns[2] = ns[3] = NULL_NODE;

  r = node_new_update_var_gimmick(&ns[2], UPDATE_VAR_RIGHT_RANGE_FROM_STACK,
                                  id1, env);
  if (r != 0) goto err;

  x = make_list(3, ns);
  if (IS_NULL(x)) goto err0;

  *node = x;
  return ONIG_NORMAL;

 err0:
  r = ONIGERR_MEMORY;
 err:
  for (i = 0; i < 4; i++) onig_node_free(ns[i]);
  return r;
}

/*
 * Build the tree of an absent group into *node from the `absent` pattern and
 * the optional `expr` (NULL_NODE selects the default expression \O*).
 *
 * When is_range_cutter is 0 and expr is absent or a simple one character
 * repeat, the work is delegated to
 * make_absent_tree_for_simple_one_char_repeat(); otherwise a list of
 * save/engine/restore nodes is assembled here, followed by the tail from
 * make_absent_tail() unless is_range_cutter is set.
 *
 * NOTE(review): see the ownership note on
 * make_absent_tree_for_simple_one_char_repeat(); the error exits here free
 * ns[3]/ns[5] (and body) after a callee has failed, which may release nodes
 * the callee already freed. To be confirmed.
 */
static int
make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter,
                 ParseEnv* env)
{
  int r;
  int i;
  int id1, id2;
  int possessive;
  Node* x;
  Node* ns[7];

  r = ONIGERR_MEMORY;
  for (i = 0; i < 7; i++) ns[i] = NULL_NODE;
  ns[4] = expr; ns[5] = absent;

  if (is_range_cutter == 0) {
    Node* quant;
    Node* body;

    if (expr == NULL_NODE) {
      /* default expr \O* */
      quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE);
      if (IS_NULL(quant)) goto err0;

      r = node_new_true_anychar(&body);
      if (r != 0) {
        onig_node_free(quant);
        goto err;
      }
      possessive = 0;
      goto simple;
    }
    else {
      if (is_simple_one_char_repeat(expr, &quant, &body, &possessive, env)) {
      simple:
        r = make_absent_tree_for_simple_one_char_repeat(node, absent, quant,
                                                        body, possessive, env);
        /* only lower/upper of quant were needed */
        onig_node_free(quant);
        if (r != 0) {
          ns[4] = NULL_NODE;
          onig_node_free(body);
          goto err;
        }

        return ONIG_NORMAL;
      }
    }
  }

  r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env);
  if (r != 0) goto err;

  id1 = GIMMICK_(ns[0])->id;

  r = node_new_save_gimmick(&ns[1], SAVE_S, env);
  if (r != 0) goto err;

  id2 = GIMMICK_(ns[1])->id;

  r = node_new_true_anychar(&ns[3]);
  if (r != 0) goto err;

  possessive = 1;
  r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT,
                         possessive, is_range_cutter, env);
  if (r != 0) goto err;

  /* the any-char node and absent were passed to the engine built in ns[2] */
  ns[3] = NULL_NODE;
  ns[5] = NULL_NODE;

  r = node_new_update_var_gimmick(&ns[3], UPDATE_VAR_S_FROM_STACK, id2, env);
  if (r != 0) goto err;

  if (is_range_cutter != 0) {
    x = make_list(4, ns);
    if (IS_NULL(x)) goto err0;
  }
  else {
    r = make_absent_tail(&ns[5], &ns[6], id1, env);
    if (r != 0) goto err;

    x = make_list(7, ns);
    if (IS_NULL(x)) goto err0;
  }

  *node = x;
  return ONIG_NORMAL;

 err0:
  r = ONIGERR_MEMORY;
 err:
  for (i = 0; i < 7; i++) onig_node_free(ns[i]);
  return r;
}

extern int
onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
{
  int addlen = (int )(end - s);

  if (addlen > 0) {
    int len  = (int )(STR_(node)->end - STR_(node)->s);

    if (STR_(node)->capacity > 0 || (len + addlen > NODE_STRING_BUF_SIZE - 1)) {
      UChar* p;
      int capa = len + addlen + NODE_STRING_MARGIN;

      if (capa <= STR_(node)->capacity) {
        onig_strcpy(STR_(node)->s + len, s, end);
      }
      else {
        if (STR_(node)->s == STR_(node)->buf)
          p = strcat_capa_from_static(STR_(node)->s, STR_(node)->end,
                                      s, end, capa);
        else
          p = strcat_capa(STR_(node)->s, STR_(node)->end, s, end, capa);

        CHECK_NULL_RETURN_MEMERR(p);
        STR_(node)->s        = p;
STR_(node)->capacity = capa; } } else { onig_strcpy(STR_(node)->s + len, s, end); } STR_(node)->end = STR_(node)->s + len + addlen; } return 0; } extern int onig_node_str_set(Node* node, const UChar* s, const UChar* end, int need_free) { onig_node_str_clear(node, need_free); return onig_node_str_cat(node, s, end); } static int node_str_cat_char(Node* node, UChar c) { UChar s[1]; s[0] = c; return onig_node_str_cat(node, s, s + 1); } extern void onig_node_str_clear(Node* node, int need_free) { if (need_free != 0 && STR_(node)->capacity != 0 && IS_NOT_NULL(STR_(node)->s) && STR_(node)->s != STR_(node)->buf) { xfree(STR_(node)->s); } STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; STR_(node)->capacity = 0; } static int node_set_str(Node* node, const UChar* s, const UChar* end) { int r; NODE_SET_TYPE(node, NODE_STRING); STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; STR_(node)->capacity = 0; r = onig_node_str_cat(node, s, end); return r; } static Node* node_new_str(const UChar* s, const UChar* end) { int r; Node* node = node_new(); CHECK_NULL_RETURN(node); r = node_set_str(node, s, end); if (r != 0) { onig_node_free(node); return NULL; } return node; } static int node_reset_str(Node* node, const UChar* s, const UChar* end) { node_free_body(node); return node_set_str(node, s, end); } extern int onig_node_reset_empty(Node* node) { return node_reset_str(node, NULL, NULL); } extern Node* onig_node_new_str(const UChar* s, const UChar* end) { return node_new_str(s, end); } static Node* node_new_str_with_options(const UChar* s, const UChar* end, OnigOptionType options) { Node* node; node = node_new_str(s, end); if (OPTON_IGNORECASE(options)) NODE_STATUS_ADD(node, IGNORECASE); return node; } static Node* node_new_str_crude(UChar* s, UChar* end, OnigOptionType options) { Node* node = node_new_str_with_options(s, end, options); CHECK_NULL_RETURN(node); NODE_STRING_SET_CRUDE(node); return node; } 
static Node* node_new_empty(void) { return node_new_str(NULL, NULL); } static Node* node_new_str_crude_char(UChar c, OnigOptionType options) { int i; UChar p[1]; Node* node; p[0] = c; node = node_new_str_crude(p, p + 1, options); /* clear buf tail */ for (i = 1; i < NODE_STRING_BUF_SIZE; i++) STR_(node)->buf[i] = '\0'; return node; } static Node* str_node_split_last_char(Node* node, OnigEncoding enc) { const UChar *p; Node* rn; StrNode* sn; sn = STR_(node); rn = NULL_NODE; if (sn->end > sn->s) { p = onigenc_get_prev_char_head(enc, sn->s, sn->end); if (p && p > sn->s) { /* can be split. */ rn = node_new_str(p, sn->end); CHECK_NULL_RETURN(rn); sn->end = (UChar* )p; STR_(rn)->flag = sn->flag; NODE_STATUS(rn) = NODE_STATUS(node); } } return rn; } static int str_node_can_be_split(Node* node, OnigEncoding enc) { StrNode* sn = STR_(node); if (sn->end > sn->s) { return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0); } return 0; } static int scan_number(UChar** src, const UChar* end, OnigEncoding enc) { int num, val; OnigCodePoint c; UChar* p; PFETCH_READY; p = *src; num = 0; while (! PEND) { PFETCH(c); if (IS_CODE_DIGIT_ASCII(enc, c)) { val = (int )DIGITVAL(c); if ((ONIG_INT_MAX - val) / 10 < num) return -1; /* overflow */ num = num * 10 + val; } else { PUNFETCH; break; } } *src = p; return num; } static int scan_hexadecimal_number(UChar** src, UChar* end, int minlen, int maxlen, OnigEncoding enc, OnigCodePoint* rcode) { OnigCodePoint code; OnigCodePoint c; unsigned int val; int n; UChar* p; PFETCH_READY; p = *src; code = 0; n = 0; while (! 
PEND && n < maxlen) { PFETCH(c); if (IS_CODE_XDIGIT_ASCII(enc, c)) { n++; val = (unsigned int )XDIGITVAL(enc, c); if ((UINT_MAX - val) / 16UL < code) return ONIGERR_TOO_BIG_NUMBER; /* overflow */ code = (code << 4) + val; } else { PUNFETCH; break; } } if (n < minlen) return ONIGERR_INVALID_CODE_POINT_VALUE; *rcode = code; *src = p; return ONIG_NORMAL; } static int scan_octal_number(UChar** src, UChar* end, int minlen, int maxlen, OnigEncoding enc, OnigCodePoint* rcode) { OnigCodePoint code; OnigCodePoint c; unsigned int val; int n; UChar* p; PFETCH_READY; p = *src; code = 0; n = 0; while (! PEND && n < maxlen) { PFETCH(c); if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8') { n++; val = (unsigned int )ODIGITVAL(c); if ((UINT_MAX - val) / 8UL < code) return ONIGERR_TOO_BIG_NUMBER; /* overflow */ code = (code << 3) + val; } else { PUNFETCH; break; } } if (n < minlen) return ONIGERR_INVALID_CODE_POINT_VALUE; *rcode = code; *src = p; return ONIG_NORMAL; } static int scan_number_of_base(UChar** src, UChar* end, int minlen, OnigEncoding enc, OnigCodePoint* rcode, int base) { int r; if (base == 16) r = scan_hexadecimal_number(src, end, minlen, 8, enc, rcode); else if (base == 8) r = scan_octal_number(src, end, minlen, 11, enc, rcode); else r = ONIGERR_INVALID_CODE_POINT_VALUE; return r; } #define IS_CODE_POINT_DIVIDE(c) ((c) == ' ' || (c) == '\n') enum CPS_STATE { CPS_EMPTY = 0, CPS_START = 1, CPS_RANGE = 2 }; static int check_code_point_sequence_cc(UChar* p, UChar* end, int base, OnigEncoding enc, int state) { int r; int n; int end_digit; OnigCodePoint code; OnigCodePoint c; PFETCH_READY; end_digit = FALSE; n = 0; while (! PEND) { start: PFETCH(c); if (c == '}') { end_char: if (state == CPS_RANGE) return ONIGERR_INVALID_CODE_POINT_VALUE; return n; } if (IS_CODE_POINT_DIVIDE(c)) { while (! PEND) { PFETCH(c); if (! 
IS_CODE_POINT_DIVIDE(c)) break;
      }
      /* only dividers up to the end of input: '}' is missing */
      if (IS_CODE_POINT_DIVIDE(c))
        return ONIGERR_INVALID_CODE_POINT_VALUE;
    }
    else if (c == '-') {
    range:
      /* '-' is only valid directly after a range start value */
      if (state != CPS_START) return ONIGERR_INVALID_CODE_POINT_VALUE;
      if (PEND) return ONIGERR_INVALID_CODE_POINT_VALUE;
      end_digit = FALSE;
      state = CPS_RANGE;
      goto start;
    }
    else if (end_digit == TRUE) {
      /* a number was just read and is not followed by a divider, '-' or '}':
         a further digit means the number was too long */
      if (base == 16) {
        if (IS_CODE_XDIGIT_ASCII(enc, c))
          return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
      }
      else if (base == 8) {
        if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8')
          return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
      }

      return ONIGERR_INVALID_CODE_POINT_VALUE;
    }

    /* c may have been replaced while skipping dividers above */
    if (c == '}') goto end_char;
    if (c == '-') goto range;

    PUNFETCH;
    r = scan_number_of_base(&p, end, 1, enc, &code, base);
    if (r != 0) return r;
    n++;
    end_digit = TRUE;
    /* a number closes an open range, otherwise it may start one */
    state = (state == CPS_RANGE) ? CPS_EMPTY : CPS_START;
  }

  return ONIGERR_INVALID_CODE_POINT_VALUE;
}

/*
 * Validate the remainder of a brace-enclosed code point sequence outside
 * a character class (no '-' ranges).
 * Returns the number of code points read up to the closing '}', or a
 * negative error code.  Running out of input before '}' is an error.
 */
static int
check_code_point_sequence(UChar* p, UChar* end, int base, OnigEncoding enc)
{
  int r;
  int n;
  int end_digit;
  OnigCodePoint code;
  OnigCodePoint c;
  PFETCH_READY;

  end_digit = FALSE;
  n = 0;
  while (! PEND) {
    PFETCH(c);
    if (c == '}') {
    end_char:
      return n;
    }

    if (IS_CODE_POINT_DIVIDE(c)) {
      while (! PEND) {
        PFETCH(c);
        if (! IS_CODE_POINT_DIVIDE(c)) break;
      }
      /* only dividers up to the end of input: '}' is missing */
      if (IS_CODE_POINT_DIVIDE(c))
        return ONIGERR_INVALID_CODE_POINT_VALUE;
    }
    else if (end_digit == TRUE) {
      /* a number not followed by a divider or '}' */
      if (base == 16) {
        if (IS_CODE_XDIGIT_ASCII(enc, c))
          return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
      }
      else if (base == 8) {
        if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8')
          return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
      }

      return ONIGERR_INVALID_CODE_POINT_VALUE;
    }

    /* c may have been replaced while skipping dividers above */
    if (c == '}') goto end_char;

    PUNFETCH;
    r = scan_number_of_base(&p, end, 1, enc, &code, base);
    if (r != 0) return r;
    n++;
    end_digit = TRUE;
  }

  return ONIGERR_INVALID_CODE_POINT_VALUE;
}

/*
 * Fetch the next element of a brace-enclosed code point sequence.
 * Leading divider characters are skipped.
 * Returns:
 *   ONIG_NORMAL  a code point was read into *rcode
 *   1            the closing '}' was consumed (end of sequence)
 *   2            a '-' was consumed (only when in_cc is TRUE)
 *   < 0          error
 * *src is advanced in every non-error case.
 */
static int
get_next_code_point(UChar** src, UChar* end, int base, OnigEncoding enc,
                    int in_cc, OnigCodePoint* rcode)
{
  int r;
  OnigCodePoint c;
  UChar* p;
  PFETCH_READY;

  p = *src;
  while (! PEND) {
    PFETCH(c);
    if (! IS_CODE_POINT_DIVIDE(c)) {
      if (c == '}') {
        *src = p;
        return 1; /* end of sequence */
      }
      else if (c == '-' && in_cc == TRUE) {
        *src = p;
        return 2; /* range */
      }
      PUNFETCH;
      break;
    }
    else {
      /* a divider must not be the last character of the input */
      if (PEND)
        return ONIGERR_INVALID_CODE_POINT_VALUE;
    }
  }

  r = scan_number_of_base(&p, end, 1, enc, rcode, base);
  if (r != 0) return r;

  *src = p;
  return ONIG_NORMAL;
}

/* Write one OnigCodePoint value at byte offset pos of bbuf. */
#define BB_WRITE_CODE_POINT(bbuf,pos,code) \
    BB_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)

/* data format:
     [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
     (all data size is OnigCodePoint)
 */
/*
 * Allocate an empty code range buffer (range count n = 0) and store it
 * in *pbuf.  On failure *pbuf is NULL and an error code is returned.
 */
static int
new_code_range(BBuf** pbuf)
{
#define INIT_MULTI_BYTE_RANGE_SIZE  (SIZE_CODE_POINT * 5)
  int r;
  OnigCodePoint n;
  BBuf* bbuf;

  bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
  CHECK_NULL_RETURN_MEMERR(bbuf);
  r = BB_INIT(bbuf, INIT_MULTI_BYTE_RANGE_SIZE);
  if (r != 0) {
    xfree(bbuf);
    *pbuf = 0;
    return r;
  }

  n = 0;
  BB_WRITE_CODE_POINT(bbuf, 0, n);
  return 0;
}

/*
 * Add the range [from, to] to the sorted range list in *pbuf, merging it
 * with every existing range it overlaps or touches.  The bounds are
 * swapped when from > to.  *pbuf is created on demand when it is NULL.
 * Returns 0 on success or an error code
 * (e.g. ONIGERR_TOO_MANY_MULTI_BYTE_RANGES).
 */
static int
add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
{
  int r, inc_n, pos;
  int low, high, bound, x;
  OnigCodePoint n, *data;
  BBuf* bbuf;

  if (from > to) {
    n = from; from = to; to = n;
  }

  if (IS_NULL(*pbuf)) {
    r = new_code_range(pbuf);
    if (r != 0) return r;
    bbuf = *pbuf;
    n = 0;
  }
  else {
    bbuf = *pbuf;
    GET_CODE_POINT(n, bbuf->p);
  }
  data = (OnigCodePoint* )(bbuf->p);
  data++;  /* skip the leading range count */

  /* low: index of the first range whose upper bound is not below from */
  for (low = 0, bound = n; low < bound; ) {
    x = (low + bound) >> 1;
    if (from > data[x*2 + 1])
      low = x + 1;
    else
      bound = x;
  }

  /* high: index just past the last range that overlaps or touches the new
     one.  For to == max value every following range is covered (this also
     keeps "to + 1" below from wrapping). */
  high = (to == ~((OnigCodePoint )0)) ?
n : low;
  for (bound = n; high < bound; ) {
    x = (high + bound) >> 1;
    if (to + 1 >= data[x*2])
      high = x + 1;
    else
      bound = x;
  }

  /* ranges [low, high) are replaced by one range: net change of the count */
  inc_n = low + 1 - high;
  if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
    return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;

  if (inc_n != 1) {
    /* at least one existing range is absorbed: widen to cover them */
    if (from > data[low*2])
      from = data[low*2];
    if (to < data[(high - 1)*2 + 1])
      to = data[(high - 1)*2 + 1];
  }

  if (inc_n != 0 && (OnigCodePoint )high < n) {
    /* shift the ranges that follow to their new position */
    int from_pos = SIZE_CODE_POINT * (1 + high * 2);
    int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
    int size = (n - high) * 2 * SIZE_CODE_POINT;

    if (inc_n > 0) {
      BB_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
    }
    else {
      BB_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
    }
  }

  pos = SIZE_CODE_POINT * (1 + low * 2);
  BB_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
  BB_WRITE_CODE_POINT(bbuf, pos, from);
  BB_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
  n += inc_n;
  BB_WRITE_CODE_POINT(bbuf, 0, n);

  return 0;
}

/*
 * Add [from, to] to a character class range buffer.
 * An inverted range (from > to) is silently ignored when the syntax has
 * ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC, otherwise it is reported as
 * ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS.
 */
static int
add_code_range(BBuf** pbuf, ParseEnv* env, OnigCodePoint from, OnigCodePoint to)
{
  if (from > to) {
    if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
      return 0;
    else
      return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
  }

  return add_code_range_to_buf(pbuf, from, to);
}

/*
 * Build in *pbuf the complement of the range list bbuf, over the interval
 * from MBCODE_START_POS(enc) to the maximum OnigCodePoint value.
 * A NULL or empty bbuf yields SET_ALL_MULTI_BYTE_RANGE().
 * Returns 0 on success; on failure the partially built buffer is freed
 * and the error code is returned.
 */
static int
not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
{
  int r, i, n;
  OnigCodePoint pre, from, *data, to = 0;

  *pbuf = (BBuf* )NULL;
  if (IS_NULL(bbuf)) {
  set_all:
    return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
  }

  data = (OnigCodePoint* )(bbuf->p);
  GET_CODE_POINT(n, data);
  data++;
  if (n <= 0) goto set_all;

  r = 0;
  pre = MBCODE_START_POS(enc);
  for (i = 0; i < n; i++) {
    from = data[i*2];
    to   = data[i*2+1];
    /* emit the gap in front of this range, if there is one */
    if (pre <= from - 1) {
      r = add_code_range_to_buf(pbuf, pre, from - 1);
      if (r != 0) {
        bbuf_free(*pbuf);
        return r;
      }
    }
    if (to == ~((OnigCodePoint )0)) break;
    pre = to + 1;
  }
  /* emit the gap after the last range */
  if (to < ~((OnigCodePoint )0)) {
    r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
    if (r != 0) bbuf_free(*pbuf);
  }
  return r;
}

/* Exchange the two (buffer, not-flag) operand pairs. */
#define SWAP_BB_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *tbuf; \
  int  tnot; \
  tnot = not1;  not1  = not2;  not2  = tnot; \
  tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
} while (0)

/*
 * Compute in *pbuf the union of two range lists, each of which may be
 * negated (not1 / not2 non-zero).  A NULL buffer stands for the empty set.
 * Returns 0 on success or an error code.
 * NOTE(review): when an add fails after *pbuf has been built, *pbuf is
 * not freed here - confirm whether callers release it.
 */
static int
or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
                  BBuf* bbuf2, int not2, BBuf** pbuf)
{
  int r;
  OnigCodePoint i, n1, *data1;
  OnigCodePoint from, to;

  *pbuf = (BBuf* )NULL;
  if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
    /* (not empty) OR anything == everything */
    if (not1 != 0 || not2 != 0)
      return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
    return 0;
  }

  r = 0;
  /* make sure a NULL operand, if any, is operand 1 */
  if (IS_NULL(bbuf2))
    SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);

  if (IS_NULL(bbuf1)) {
    if (not1 != 0) {
      return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
    }
    else {
      if (not2 == 0) {
        return bbuf_clone(pbuf, bbuf2);
      }
      else {
        return not_code_range_buf(enc, bbuf2, pbuf);
      }
    }
  }

  /* make sure a negated operand, if any, is operand 2 */
  if (not1 != 0)
    SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);

  data1 = (OnigCodePoint* )(bbuf1->p);
  GET_CODE_POINT(n1, data1);
  data1++;

  if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
    r = bbuf_clone(pbuf, bbuf2);
  }
  else if (not1 == 0) { /* 1 OR (not 2) */
    r = not_code_range_buf(enc, bbuf2, pbuf);
  }
  if (r != 0) return r;

  /* merge every range of operand 1 into the result */
  for (i = 0; i < n1; i++) {
    from = data1[i*2];
    to   = data1[i*2+1];
    r = add_code_range_to_buf(pbuf, from, to);
    if (r != 0) return r;
  }
  return 0;
}

/*
 * Add to *pbuf the parts of [from1, to1] that are not covered by any of
 * the n ranges in data (i.e. [from1, to1] AND NOT data).
 * Returns 0 on success or the error from add_code_range_to_buf().
 */
static int
and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
                OnigCodePoint* data, int n)
{
  int i, r;
  OnigCodePoint from2, to2;

  for (i = 0; i < n; i++) {
    from2 = data[i*2];
    to2   = data[i*2+1];
    if (from2 < from1) {
      if (to2 < from1) continue;
      else {
        /* range 2 cuts off the front of range 1 */
        from1 = to2 + 1;
      }
    }
    else if (from2 <= to1) {
      if (to2 < to1) {
        /* range 2 lies inside range 1: emit the part in front of it */
        if (from1 <= from2 - 1) {
          r = add_code_range_to_buf(pbuf, from1, from2-1);
          if (r != 0) return r;
        }
        from1 = to2 + 1;
      }
      else {
        /* range 2 cuts off the tail of range 1 */
        to1 = from2 - 1;
      }
    }
    else {
      from1 = from2;
    }
    if (from1 > to1) break;
  }
  if (from1 <= to1) {
    r = add_code_range_to_buf(pbuf, from1, to1);
    if (r != 0) return r;
  }
  return 0;
}

/*
 * Compute in *pbuf the intersection of two range lists, each of which may
 * be negated (not1 / not2 non-zero).  A NULL buffer stands for the empty
 * set.  Returns 0 on success or an error code.
 */
static int
and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
{
  int r;
  OnigCodePoint i, j, n1, n2, *data1, *data2;
  OnigCodePoint from, to, from1, to1, from2, to2;

  *pbuf = (BBuf* )NULL;
  if (IS_NULL(bbuf1)) {
if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
      return bbuf_clone(pbuf, bbuf2);
    return 0;
  }
  else if (IS_NULL(bbuf2)) {
    if (not2 != 0)
      return bbuf_clone(pbuf, bbuf1);
    return 0;
  }

  /* make sure a negated operand, if any, is operand 2 */
  if (not1 != 0)
    SWAP_BB_NOT(bbuf1, not1, bbuf2, not2);

  data1 = (OnigCodePoint* )(bbuf1->p);
  data2 = (OnigCodePoint* )(bbuf2->p);
  GET_CODE_POINT(n1, data1);
  GET_CODE_POINT(n2, data2);
  data1++;
  data2++;

  if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
    for (i = 0; i < n1; i++) {
      from1 = data1[i*2];
      to1   = data1[i*2+1];
      for (j = 0; j < n2; j++) {
        from2 = data2[j*2];
        to2   = data2[j*2+1];
        if (from2 > to1) break;
        if (to2 < from1) continue;
        from = MAX(from1, from2);
        to   = MIN(to1, to2);
        r = add_code_range_to_buf(pbuf, from, to);
        if (r != 0) return r;
      }
    }
  }
  else if (not1 == 0) { /* 1 AND (not 2) */
    for (i = 0; i < n1; i++) {
      from1 = data1[i*2];
      to1   = data1[i*2+1];
      r = and_code_range1(pbuf, from1, to1, data2, n2);
      if (r != 0) return r;
    }
  }

  return 0;
}

/*
 * dest = dest AND cc.
 * The single-byte bitset is combined first; for encodings that are not
 * single-byte the multi-byte range buffers are combined as well and
 * dest->mbuf is replaced (the old buffer is freed).
 * Returns 0 on success or an error code.
 */
static int
and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
{
  int r, not1, not2;
  BBuf *buf1, *buf2, *pbuf;
  BitSetRef bsr1, bsr2;
  BitSet bs1, bs2;

  not1 = IS_NCCLASS_NOT(dest);
  bsr1 = dest->bs;
  buf1 = dest->mbuf;
  not2 = IS_NCCLASS_NOT(cc);
  bsr2 = cc->bs;
  buf2 = cc->mbuf;

  /* work on the effective (already negated) bit sets */
  if (not1 != 0) {
    bitset_invert_to(bsr1, bs1);
    bsr1 = bs1;
  }
  if (not2 != 0) {
    bitset_invert_to(bsr2, bs2);
    bsr2 = bs2;
  }
  bitset_and(bsr1, bsr2);
  if (bsr1 != dest->bs) {
    bitset_copy(dest->bs, bsr1);
  }
  /* dest keeps its NOT flag, so store the inverse of the result */
  if (not1 != 0) {
    bitset_invert(dest->bs);
  }

  if (! ONIGENC_IS_SINGLEBYTE(enc)) {
    if (not1 != 0 && not2 != 0) {
      /* (not 1) AND (not 2) == not (1 OR 2); dest stays negated */
      r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
    }
    else {
      r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
      if (r == 0 && not1 != 0) {
        /* dest stays negated: store the complement of the result */
        BBuf *tbuf;
        r = not_code_range_buf(enc, pbuf, &tbuf);
        if (r != 0) {
          bbuf_free(pbuf);
          return r;
        }
        bbuf_free(pbuf);
        pbuf = tbuf;
      }
    }
    if (r != 0) return r;

    dest->mbuf = pbuf;
    bbuf_free(buf1);
    return r;
  }
  return 0;
}

/*
 * dest = dest OR cc.
 * Mirrors and_cclass(): the bitset is combined first, then (for encodings
 * that are not single-byte) the multi-byte range buffers, replacing
 * dest->mbuf.  Returns 0 on success or an error code.
 */
static int
or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
{
  int r, not1, not2;
  BBuf *buf1, *buf2, *pbuf;
  BitSetRef bsr1, bsr2;
  BitSet bs1, bs2;

  not1 = IS_NCCLASS_NOT(dest);
  bsr1 = dest->bs;
  buf1 = dest->mbuf;
  not2 = IS_NCCLASS_NOT(cc);
  bsr2 = cc->bs;
  buf2 = cc->mbuf;

  /* work on the effective (already negated) bit sets */
  if (not1 != 0) {
    bitset_invert_to(bsr1, bs1);
    bsr1 = bs1;
  }
  if (not2 != 0) {
    bitset_invert_to(bsr2, bs2);
    bsr2 = bs2;
  }
  bitset_or(bsr1, bsr2);
  if (bsr1 != dest->bs) {
    bitset_copy(dest->bs, bsr1);
  }
  /* dest keeps its NOT flag, so store the inverse of the result */
  if (not1 != 0) {
    bitset_invert(dest->bs);
  }

  if (! ONIGENC_IS_SINGLEBYTE(enc)) {
    if (not1 != 0 && not2 != 0) {
      /* (not 1) OR (not 2) == not (1 AND 2); dest stays negated */
      r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
    }
    else {
      r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
      if (r == 0 && not1 != 0) {
        /* dest stays negated: store the complement of the result */
        BBuf *tbuf;
        r = not_code_range_buf(enc, pbuf, &tbuf);
        if (r != 0) {
          bbuf_free(pbuf);
          return r;
        }
        bbuf_free(pbuf);
        pbuf = tbuf;
      }
    }
    if (r != 0) return r;

    dest->mbuf = pbuf;
    bbuf_free(buf1);
    return r;
  }
  else
    return 0;
}

/*
 * Map the character following a backslash to the control character it
 * denotes (\n, \t, \r, \f, \a, \b, \e and, if enabled, \v) when the syntax
 * has ONIG_SYN_OP_ESC_CONTROL_CHARS.  Any other character is returned
 * unchanged.
 */
static OnigCodePoint
conv_backslash_value(OnigCodePoint c, ParseEnv* env)
{
  if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
    switch (c) {
    case 'n': return '\n';
    case 't': return '\t';
    case 'r': return '\r';
    case 'f': return '\f';
    case 'a': return '\007';
    case 'b': return '\010';
    case 'e': return '\033';
    case 'v':
      if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
        return '\v';
      break;

    default:
      break;
    }
  }
  return c;
}

/*
 * Return 1 when node must not be the target of a quantifier, else 0.
 * Anchors and gimmick nodes are invalid; an alternation is invalid when
 * any of its branches is.  Bag nodes are always accepted.
 * NOTE(review): the NODE_LIST case returns 0 on every path, so a list is
 * never reported as invalid - confirm that this is intended.
 */
static int
is_invalid_quantifier_target(Node* node)
{
  switch (NODE_TYPE(node)) {
  case NODE_ANCHOR:
  case NODE_GIMMICK:
    return 1;
    break;

  case NODE_BAG:
    /* allow enclosed elements */
    /* return is_invalid_quantifier_target(NODE_BODY(node)); */
    break;

  case NODE_LIST:
    do {
      if (! is_invalid_quantifier_target(NODE_CAR(node))) return 0;
    } while (IS_NOT_NULL(node = NODE_CDR(node)));
    return 0;
    break;

  case NODE_ALT:
    do {
      if (is_invalid_quantifier_target(NODE_CAR(node))) return 1;
    } while (IS_NOT_NULL(node = NODE_CDR(node)));
    break;

  default:
    break;
  }
  return 0;
}

/* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
/*
 * Classify a quantifier as one of the six simple forms listed above.
 * Returns -1 for any other lower/upper combination.
 */
static int
quantifier_type_num(QuantNode* q)
{
  if (q->greedy) {
    if (q->lower == 0) {
      if (q->upper == 1) return 0;
      else if (IS_INFINITE_REPEAT(q->upper)) return 1;
    }
    else if (q->lower == 1) {
      if (IS_INFINITE_REPEAT(q->upper)) return 2;
    }
  }
  else {
    if (q->lower == 0) {
      if (q->upper == 1) return 3;
      else if (IS_INFINITE_REPEAT(q->upper)) return 4;
    }
    else if (q->lower == 1) {
      if (IS_INFINITE_REPEAT(q->upper)) return 5;
    }
  }
  return -1;
}

/* Action taken by onig_reduce_nested_quantifier() for a nested pair. */
enum ReduceType {
  RQ_ASIS = 0, /* as is */
  RQ_DEL  = 1, /* delete parent */
  RQ_A,        /* to '*'    */
  RQ_P,        /* to '+'    */
  RQ_AQ,       /* to '*?'   */
  RQ_QQ,       /* to '??'   */
  RQ_P_QQ,     /* to '+)??' */
};

/*
 * Reduction table, indexed as [child type][parent type] with the numbers
 * returned by quantifier_type_num().  The row comments name the child.
 */
static enum ReduceType ReduceTypeTable[6][6] = {
  {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
  {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
  {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
  {RQ_ASIS, RQ_A,    RQ_P,   RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
};

/*
 * Simplify a quantifier (pnode) whose body is itself a quantifier.
 * Two fixed counts {n}{m} are merged into {n*m}; pairs of simple
 * quantifiers are rewritten according to ReduceTypeTable.  When the child
 * becomes redundant it is detached and freed.
 * Returns 0, or ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE when the merged
 * count overflows.
 */
extern int
onig_reduce_nested_quantifier(Node* pnode)
{
  int pnum, cnum;
  QuantNode *p, *c;
  Node* cnode;

  cnode = NODE_BODY(pnode);

  p = QUANT_(pnode);
  c = QUANT_(cnode);
  pnum = quantifier_type_num(p);
  cnum = quantifier_type_num(c);
  if (pnum < 0 || cnum < 0) {
    if (p->lower == p->upper && c->lower == c->upper) {
      int n = onig_positive_int_multiply(p->lower, c->lower);
      if (n < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

      p->lower = p->upper = n;
      NODE_BODY(pnode) = NODE_BODY(cnode);
      goto remove_cnode;
    }

    return 0;
  }

  switch(ReduceTypeTable[cnum][pnum]) {
  case RQ_DEL:
    /* the child alone is equivalent: copy it over the parent */
    *pnode = *cnode;
    goto remove_cnode;
    break;
  case RQ_A:
    NODE_BODY(pnode) = NODE_BODY(cnode);
    p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 1;
    goto remove_cnode;
    break;
  case RQ_P:
    NODE_BODY(pnode) = NODE_BODY(cnode);
    p->lower  = 1;  p->upper = INFINITE_REPEAT;  p->greedy = 1;
    goto remove_cnode;
    break;
  case RQ_AQ:
    NODE_BODY(pnode) = NODE_BODY(cnode);
    p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 0;
    goto remove_cnode;
    break;
  case RQ_QQ:
    NODE_BODY(pnode) = NODE_BODY(cnode);
    p->lower  = 0;  p->upper = 1;  p->greedy = 0;
    goto remove_cnode;
    break;
  case RQ_P_QQ:
    /* both nodes are kept: parent becomes '??', child becomes '+' */
    p->lower  = 0;  p->upper = 1;  p->greedy = 0;
    c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 1;
    break;
  case RQ_ASIS:
    break;
  }

  return 0;

 remove_cnode:
  /* the child's body now belongs to the parent (or to the copy of the
     child stored in the parent): detach it before freeing the child */
  NODE_BODY(cnode) = NULL_NODE;
  onig_node_free(cnode);
  return 0;
}

/*
 * Build the node tree for a general newline and store it in *node:
 * an if-else bag that tries the CR + NEWLINE_CODE string first and
 * otherwise a character class covering NEWLINE_CODE..0x0d (plus 0x85 and
 * 0x2028-0x2029 for Unicode encodings).
 * Returns 0 on success or a negative error code.
 */
static int
node_new_general_newline(Node** node, ParseEnv* env)
{
  int r;
  int dlen, alen;
  UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
  Node* crnl;
  Node* ncc;
  Node* x;
  CClassNode* cc;

  dlen = ONIGENC_CODE_TO_MBC(env->enc, 0x0d, buf);
  if (dlen < 0) return dlen;
  alen = ONIGENC_CODE_TO_MBC(env->enc, NEWLINE_CODE, buf + dlen);
  if (alen < 0) return alen;

  crnl = node_new_str_crude(buf, buf + dlen + alen, ONIG_OPTION_NONE);
  CHECK_NULL_RETURN_MEMERR(crnl);

  ncc = node_new_cclass();
  if (IS_NULL(ncc)) goto err2;

  cc = CCLASS_(ncc);
  if (dlen == 1) {
    bitset_set_range(cc->bs, NEWLINE_CODE, 0x0d);
  }
  else {
    r = add_code_range(&(cc->mbuf), env,
NEWLINE_CODE, 0x0d);
    if (r != 0) {
    err1:
      onig_node_free(ncc);
    err2:
      onig_node_free(crnl);
      return ONIGERR_MEMORY;
    }
  }

  if (ONIGENC_IS_UNICODE_ENCODING(env->enc)) {
    r = add_code_range(&(cc->mbuf), env, 0x85, 0x85);
    if (r != 0) goto err1;
    r = add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
    if (r != 0) goto err1;
  }

  x = node_new_bag_if_else(crnl, NULL_NODE, ncc);
  if (IS_NULL(x)) goto err1;

  *node = x;
  return 0;
}

/* Token kinds produced by fetch_token() and fetch_token_cc(). */
enum TokenSyms {
  TK_EOT      = 0,   /* end of token */
  TK_CRUDE_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_REPEAT,
  TK_INTERVAL,
  TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_OPEN_CC,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
  TK_KEEP,             /* \K */
  TK_GENERAL_NEWLINE,  /* \R */
  TK_NO_NEWLINE,       /* \N */
  TK_TRUE_ANYCHAR,     /* \O */
  TK_TEXT_SEGMENT,     /* \X */

  /* in cc */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_CC_POSIX_BRACKET_OPEN,
  TK_CC_AND,           /* && */
  TK_CC_OPEN_CC        /* [ */
};

/* One token of the pattern; u holds the payload that belongs to type. */
typedef struct {
  enum TokenSyms type;
  int code_point_continue; /* non-zero while inside a multi-value
                              code point brace sequence */
  int escaped;
  int base_num;   /* is number: 8, 16 (used in [....]) */
  UChar* backp;
  union {
    UChar* s;
    UChar byte;
    OnigCodePoint code;
    int   anchor;
    int   subtype;
    struct {
      int lower;
      int upper;
      int greedy;
      int possessive;
    } repeat;
    struct {
      int  num;
      int  ref1;
      int* refs;
      int  by_name;
#ifdef USE_BACKREF_WITH_LEVEL
      int  exist_level;
      int  level;   /* \k<name+n> */
#endif
    } backref;
    struct {
      UChar* name;
      UChar* name_end;
      int    gnum;
      int    by_number;
    } call;
    struct {
      int ctype;
      int not;
    } prop;
  } u;
} PToken;

/* Prepare a token for the first fetch: no pending code point sequence. */
static void
ptoken_init(PToken* tok)
{
  tok->code_point_continue = 0;
}

/*
 * Parse an interval quantifier; *src points just after the opening '{'.
 * On success tok becomes a TK_INTERVAL token and *src is advanced past
 * the closing '}'.
 * Returns:
 *   0    normal {n,m}
 *   2    fixed {n}
 *   1    not a valid interval, but the syntax allows that
 *        (ONIG_SYN_ALLOW_INVALID_INTERVAL); *src is left unchanged
 *   < 0  error
 */
static int
fetch_interval(UChar** src, UChar* end, PToken* tok, ParseEnv* env)
{
  int low, up, syn_allow, non_low;
  int r;
  OnigCodePoint c;
  OnigEncoding enc;
  UChar* p;
  PFETCH_READY;

  p = *src;
  r = 0;
  non_low = 0;
  enc = env->enc;
  syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);

  if (PEND) {
    if (syn_allow)
      return 1;  /* "....{" : OK! */
    else
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
  }

  if (! syn_allow) {
    c = PPEEK;
    if (c == ')' || c == '(' || c == '|') {
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
    }
  }

  low = scan_number(&p, end, env->enc);
  if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
  if (low > ONIG_MAX_REPEAT_NUM)
    return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

  if (p == *src) { /* can't read low */
    if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
      /* allow {,n} as {0,n} */
      low = 0;
      non_low = 1;
    }
    else
      goto invalid;
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (c == ',') {
    UChar* prev = p;
    up = scan_number(&p, end, env->enc);
    if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
    if (up > ONIG_MAX_REPEAT_NUM)
      return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

    if (p == prev) {
      /* "{,}" has neither bound */
      if (non_low != 0)
        goto invalid;
      up = INFINITE_REPEAT;  /* {n,} : {n,infinite} */
    }
  }
  else {
    if (non_low != 0)
      goto invalid;

    PUNFETCH;
    up = low;  /* {n} : exact n times */
    r = 2;     /* fixed */
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
    /* in this syntax the interval is closed by an escaped brace */
    if (c != MC_ESC(env->syntax) || PEND) goto invalid;
    PFETCH(c);
  }
  if (c != '}') goto invalid;

  if (!IS_INFINITE_REPEAT(up) && low > up) {
    /* {n,m}+ supported case */
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL))
      return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;

    /* otherwise a reversed interval is taken as possessive {up,low} */
    tok->u.repeat.possessive = 1;
    {
      int tmp;
      tmp = low; low = up; up = tmp;
    }
  }
  else
    tok->u.repeat.possessive = 0;

  tok->type = TK_INTERVAL;
  tok->u.repeat.lower = low;
  tok->u.repeat.upper = up;
  *src = p;
  return r; /* 0: normal {n,m}, 2: fixed {n} */

 invalid:
  if (syn_allow) {
    /* *src = p; */ /* !!! Don't do this line !!! */
    return 1;  /* OK */
  }
  else
    return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
}

/* \M-, \C-, \c, or \... */
/*
 * Decode the escape sequence starting at *src (just after the escape
 * character) into a single code value stored in *val.
 * Handles \M-x, \C-x and \cx when the syntax enables them (the x part may
 * itself be an escape) and falls back to conv_backslash_value().
 * Returns 0 on success with *src advanced, or a negative error code.
 */
static int
fetch_escaped_value_raw(UChar** src, UChar* end, ParseEnv* env,
                        OnigCodePoint* val)
{
  int v;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;

  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

  PFETCH_S(c);
  switch (c) {
  case 'M':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_META_CODE_SYNTAX;
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c == MC_ESC(env->syntax)) {
        v = fetch_escaped_value_raw(&p, end, env, &c);
        if (v < 0) return v;
      }
      /* meta: keep the low byte and set its top bit */
      c = ((c & 0xff) | 0x80);
    }
    else
      goto backslash;
    break;

  case 'C':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
      goto control;
    }
    else
      goto backslash;

  case 'c':
    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
    control:
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c == '?') {
        c = 0177;
      }
      else {
        if (c == MC_ESC(env->syntax)) {
          v = fetch_escaped_value_raw(&p, end, env, &c);
          if (v < 0) return v;
        }
        /* control: clear bits 5 and 6 */
        c &= 0x9f;
      }
      break;
    }
    /* fall through */

  default:
    {
    backslash:
      c = conv_backslash_value(c, env);
    }
    break;
  }

  *src = p;
  *val = c;
  return 0;
}

/*
 * Like fetch_escaped_value_raw(), and additionally rejects a value for
 * which ONIGENC_CODE_TO_MBCLEN() reports an error in the current encoding.
 */
static int
fetch_escaped_value(UChar** src, UChar* end, ParseEnv* env, OnigCodePoint* val)
{
  int r;
  int len;

  r = fetch_escaped_value_raw(src, end, env, val);
  if (r != 0) return r;

  len = ONIGENC_CODE_TO_MBCLEN(env->enc, *val);
  if (len < 0) return len;

  return 0;
}

static int fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env);

/*
 * Return the character that closes a name opened by start
 * ('<' -> '>', '\'' -> '\'', '(' -> ')'), or 0 for any other opener.
 */
static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)
{
  switch (start) {
  case '<':  return (OnigCodePoint )'>';  break;
  case '\'': return (OnigCodePoint )'\''; break;
  case '(':  return (OnigCodePoint )')';  break;
  default:
    break;
  }

  return (OnigCodePoint )0;
}

/* How a group reference was written. */
enum REF_NUM {
  IS_NOT_NUM = 0,  /* a name */
  IS_ABS_NUM = 1,  /* an unsigned number */
  IS_REL_NUM = 2   /* a number with a leading '+' or '-' */
};

#ifdef USE_BACKREF_WITH_LEVEL
/*
   \k<name+n>, \k<name-n>
   \k<num+n>,  \k<num-n>
   \k<-num+n>, \k<-num-n>
   \k<+num+n>, \k<+num-n>
*/
/*
 * Parse a group name or number with an optional "+n" / "-n" level suffix;
 * *src points just after the opening delimiter start_code.
 * On success *rname_end points at the end of the name, *num_type tells
 * whether it was a name or a number, *rback_num holds the signed number
 * (for numeric references), *rlevel the signed level (when present) and
 * *src is advanced past the closing delimiter.
 * Returns 1 when a level was given, 0 when not, or a negative error code
 * (the error string of env is set for name errors).
 */
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ParseEnv* env,
                      int* rback_num, int* rlevel, enum REF_NUM* num_type)
{
  int r, sign, exist_level;
  int digit_count;
  OnigCodePoint end_code;
  OnigCodePoint c;
  OnigEncoding enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p;
  PFETCH_READY;

  p = *src;
  c = 0;
  enc = env->enc;
  *rback_num = 0;
  exist_level = 0;
  *num_type = IS_NOT_NUM;
  sign = 1;
  pnum_head = *src;

  end_code = get_name_end_code_point(start_code);

  digit_count = 0;
  name_end = end;
  r = 0;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    /* the first character decides between name and number */
    PFETCH(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (IS_CODE_DIGIT_ASCII(enc, c)) {
      *num_type = IS_ABS_NUM;
      digit_count++;
    }
    else if (c == '-') {
      *num_type = IS_REL_NUM;
      sign = -1;
      pnum_head = p;
    }
    else if (c == '+') {
      *num_type = IS_REL_NUM;
      sign = 1;
      pnum_head = p;
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  /* scan up to the closing delimiter or the start of the level suffix */
  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      if (*num_type != IS_NOT_NUM && digit_count == 0)
        r = ONIGERR_INVALID_GROUP_NAME;
      break;
    }

    if (*num_type != IS_NOT_NUM) {
      if (IS_CODE_DIGIT_ASCII(enc, c)) {
        digit_count++;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        *num_type = IS_NOT_NUM;
      }
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0 && c != end_code) {
    if (c == '+' || c == '-') {
      int level;
      int flag = (c == '-' ? -1 : 1);

      if (PEND) {
        r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
        goto end;
      }
      PFETCH(c);
      if (!
IS_CODE_DIGIT_ASCII(enc, c)) goto err;
      PUNFETCH;
      level = scan_number(&p, end, enc);
      if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
      *rlevel = (level * flag);
      exist_level = 1;

      if (!PEND) {
        PFETCH(c);
        if (c == end_code)
          goto end;
      }
    }

  err:
    name_end = end;
  err2:
    r = ONIGERR_INVALID_GROUP_NAME;
  }

 end:
  if (r == 0) {
    if (*num_type != IS_NOT_NUM) {
      *rback_num = scan_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) {
        /* a relative reference of 0 is invalid */
        if (*num_type == IS_REL_NUM)
          goto err2;
      }

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return (exist_level ? 1 : 0);
  }
  else {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
#endif /* USE_BACKREF_WITH_LEVEL */

/*
  ref: 0 -> define name    (don't allow number name)
       1 -> reference name (allow number name)
*/
/*
 * Parse a group name or, when is_ref is TRUE, a group number;
 * *src points just after the opening delimiter start_code.
 * On success *rname_end points at the end of the name, *num_type tells
 * whether it was a name or a number, *rback_num holds the signed number
 * (for numeric references) and *src is advanced past the closing
 * delimiter.
 * Returns 0 on success or a negative error code; for name errors the
 * error string of env is set to the offending name.
 */
static int
fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
           UChar** rname_end, ParseEnv* env, int* rback_num,
           enum REF_NUM* num_type, int is_ref)
{
  int r, sign;
  int digit_count;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;

  *rback_num = 0;

  end_code = get_name_end_code_point(start_code);

  digit_count = 0;
  name_end = end;
  pnum_head = *src;
  r = 0;
  *num_type = IS_NOT_NUM;
  sign = 1;
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    /* the first character decides between name and number */
    PFETCH_S(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (IS_CODE_DIGIT_ASCII(enc, c)) {
      if (is_ref == TRUE)
        *num_type = IS_ABS_NUM;
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
      }
      digit_count++;
    }
    else if (c == '-') {
      if (is_ref == TRUE) {
        *num_type = IS_REL_NUM;
        sign = -1;
        pnum_head = p;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
      }
    }
    else if (c == '+') {
      if (is_ref == TRUE) {
        *num_type = IS_REL_NUM;
        sign = 1;
        pnum_head = p;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
      }
    }
    else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0) {
    while (!PEND) {
      name_end = p;
      PFETCH_S(c);
      if (c == end_code || c == ')') {
        if (*num_type != IS_NOT_NUM && digit_count == 0)
          r = ONIGERR_INVALID_GROUP_NAME;
        break;
      }

      if (*num_type != IS_NOT_NUM) {
        if (IS_CODE_DIGIT_ASCII(enc, c)) {
          digit_count++;
        }
        else {
          if (!ONIGENC_IS_CODE_WORD(enc, c))
            r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
          else
            r = ONIGERR_INVALID_GROUP_NAME;

          *num_type = IS_NOT_NUM;
        }
      }
      else {
        if (!ONIGENC_IS_CODE_WORD(enc, c)) {
          r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
        }
      }
    }

    /* NOTE(review): an error stored in r by the loop above is not checked
       from here on; when the name is properly terminated this path ends
       in "return 0".  Confirm whether that is intended. */
    if (c != end_code) {
      r = ONIGERR_INVALID_GROUP_NAME;
      goto err;
    }

    if (*num_type != IS_NOT_NUM) {
      *rback_num = scan_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) {
        /* a relative reference of 0 is invalid */
        if (*num_type == IS_REL_NUM) {
          r = ONIGERR_INVALID_GROUP_NAME;
          goto err;
        }
      }

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return 0;
  }
  else {
    /* find the end of the bad name for the error message */
    while (!PEND) {
      name_end = p;
      PFETCH_S(c);
      if (c == end_code || c == ')')
        break;
    }
    if (PEND)
      name_end = end;

  err:
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}

/*
 * Emit the "character class has '%s' without escape" warning for c,
 * provided a warning function is installed and the syntax has both
 * ONIG_SYN_WARN_CC_OP_NOT_ESCAPED and ONIG_SYN_BACKSLASH_ESCAPE_IN_CC.
 */
static void
CC_ESC_WARN(ParseEnv* env, UChar *c)
{
  if (onig_warn == onig_null_warn) return ;

  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
      IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
    UChar buf[WARN_BUFSIZE];
    onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
                               env->pattern, env->pattern_end,
                               (UChar* )"character class has '%s' without escape",
                               c);
    (*onig_warn)((char* )buf);
  }
}

/*
 * Emit the "regular expression has '%s' without escape" warning for c,
 * provided a warning function is installed and the syntax has
 * ONIG_SYN_WARN_CC_OP_NOT_ESCAPED.
 */
static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ParseEnv* env, UChar* c)
{
  if (onig_warn == onig_null_warn) return ;

  if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
    UChar buf[WARN_BUFSIZE];
    onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
                               (env)->pattern, (env)->pattern_end,
                               (UChar* )"regular expression has '%s' without escape",
                               c);
    (*onig_warn)((char* )buf);
  }
}

/*
 * Search [from, to) for the code point sequence s[0..n-1].
 * Returns a pointer to the start of the first match, or NULL_UCHARP when
 * there is none.  When next is not NULL it receives the position just
 * after the match.
 */
static UChar*
find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
                  UChar **next, OnigEncoding enc)
{
  int i;
  OnigCodePoint x;
  UChar *q;
  UChar *p = from;

  while (p < to) {
    x = ONIGENC_MBC_TO_CODE(enc, p,
to);
    q = p + enclen(enc, p);
    if (x == s[0]) {
      for (i = 1; i < n && q < to; i++) {
        x = ONIGENC_MBC_TO_CODE(enc, q, to);
        if (x != s[i]) break;
        q += enclen(enc, q);
      }
      if (i >= n) {
        if (IS_NOT_NULL(next))
          *next = q;
        return p;
      }
    }
    p = q;
  }
  return NULL_UCHARP;
}

/*
 * Check whether the code point sequence s[0..n-1] occurs in [from, to)
 * before an unescaped occurrence of bad.
 * A character that follows the escape character of syn is skipped.
 * Returns 1 when the sequence is found, otherwise 0.
 */
static int
str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
                         OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
{
  int i, in_esc;
  OnigCodePoint x;
  UChar *q;
  UChar *p = from;

  in_esc = 0;
  while (p < to) {
    if (in_esc) {
      /* the escaped character itself is never examined */
      in_esc = 0;
      p += enclen(enc, p);
    }
    else {
      x = ONIGENC_MBC_TO_CODE(enc, p, to);
      q = p + enclen(enc, p);
      if (x == s[0]) {
        for (i = 1; i < n && q < to; i++) {
          x = ONIGENC_MBC_TO_CODE(enc, q, to);
          if (x != s[i]) break;
          q += enclen(enc, q);
        }
        if (i >= n) return 1;
        p += enclen(enc, p);
      }
      else {
        x = ONIGENC_MBC_TO_CODE(enc, p, to);
        if (x == bad) return 0;
        else if (x == MC_ESC(syn)) in_esc = 1;
        p = q;
      }
    }
  }
  return 0;
}

/*
 * Fetch the next token inside a character class.
 * state is the caller's character class parse state; it is only compared
 * with CS_RANGE here, to pick the initial state for validating a brace
 * code point sequence.
 * On success tok is filled in, *src is advanced and the token type is
 * returned (TK_EOT at the end of input, without touching *src).
 * A negative error code is returned on failure.
 */
static int
fetch_token_cc(PToken* tok, UChar** src, UChar* end, ParseEnv* env, int state)
{
  int r;
  OnigCodePoint code;
  OnigCodePoint c, c2;
  int mindigits, maxdigits;
  OnigSyntaxType* syn;
  OnigEncoding enc;
  UChar* prev;
  UChar* p;
  PFETCH_READY;

  p = *src;
  enc = env->enc;
  syn = env->syntax;
  if (tok->code_point_continue != 0) {
    /* still inside a brace sequence such as \x{A B-C ...} */
    r = get_next_code_point(&p, end, tok->base_num, enc, TRUE, &code);
    if (r == 1) {
      /* closing '}' consumed: continue with an ordinary token */
      tok->code_point_continue = 0;
    }
    else if (r == 2) {
      tok->type = TK_CC_RANGE;
      goto end;
    }
    else if (r == 0) {
      tok->type   = TK_CODE_POINT;
      tok->u.code = code;
      goto end;
    }
    else
      return r; /* error */
  }

  if (PEND) {
    tok->type = TK_EOT;
    return tok->type;
  }

  PFETCH(c);
  tok->type = TK_CHAR;
  tok->base_num = 0;
  tok->u.code   = c;
  tok->escaped  = 0;

  if (c == ']') {
    tok->type = TK_CC_CLOSE;
  }
  else if (c == '-') {
    tok->type = TK_CC_RANGE;
  }
  else if (c == MC_ESC(syn)) {
    /* without ONIG_SYN_BACKSLASH_ESCAPE_IN_CC the escape character is
       an ordinary character */
    if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
      goto end;

    if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

    PFETCH(c);
    tok->escaped = 1;
    tok->u.code  = c;
    switch (c) {
    case 'w':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
      tok->u.prop.not   = 0;
      break;
    case 'W':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
      tok->u.prop.not   = 1;
      break;
    case 'd':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
      tok->u.prop.not   = 0;
      break;
    case 'D':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
      tok->u.prop.not   = 1;
      break;
    case 's':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
      tok->u.prop.not   = 0;
      break;
    case 'S':
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
      tok->u.prop.not   = 1;
      break;
    case 'h':
      if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
      tok->u.prop.not   = 0;
      break;
    case 'H':
      if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
      tok->type = TK_CHAR_TYPE;
      tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
      tok->u.prop.not   = 1;
      break;

    case 'p':
    case 'P':
      if (PEND) break;

      c2 = PPEEK;
      if (c2 == '{' &&
          IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
        PINC;
        tok->type = TK_CHAR_PROPERTY;
        tok->u.prop.not = c == 'P';

        if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
          PFETCH(c2);
          if (c2 == '^') {
            /* a leading '^' toggles the negation */
            tok->u.prop.not = tok->u.prop.not == 0;
          }
          else
            PUNFETCH;
        }
      }
      break;

    case 'o':
      if (PEND) break;

      prev = p;
      if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
        PINC;
        r = scan_octal_number(&p, end, 0, 11, enc, &code);
        if (r < 0) return r;
        if (!PEND) {
          c2 = PPEEK;
          if (IS_CODE_DIGIT_ASCII(enc, c2))
            return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
        }

        tok->base_num = 8;
        /* the rest is shared with the \x{...} handling below */
        goto brace_code_point_entry;
      }
      break;

    case 'x':
      if (PEND) break;

      prev = p;
      if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
        PINC;
        r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code);
        if (r < 0) return r;
        if (!PEND) {
          c2 = PPEEK;
          if (IS_CODE_XDIGIT_ASCII(enc, c2))
            return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
        }

        tok->base_num = 16;

      brace_code_point_entry:
        /* true when something was read after the '{' */
        if ((p > prev + enclen(enc, prev))) {
          if (PEND) return ONIGERR_INVALID_CODE_POINT_VALUE;

          if (PPEEK_IS('}')) {
            PINC;
          }
          else {
            /* more values follow: validate the whole sequence now and
               deliver the remaining values on the following calls */
            int curr_state;

            curr_state = (state == CS_RANGE) ? CPS_EMPTY : CPS_START;
            r = check_code_point_sequence_cc(p, end, tok->base_num, enc,
                                             curr_state);
            if (r < 0) return r;
            if (r == 0) return ONIGERR_INVALID_CODE_POINT_VALUE;
            tok->code_point_continue = TRUE;
          }
          tok->type   = TK_CODE_POINT;
          tok->u.code = code;
        }
        else {
          /* can't read nothing or invalid format */
          p = prev;
        }
      }
      else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
        r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code);
        if (r < 0) return r;
        if (p == prev) {  /* can't read nothing. */
          code = 0; /* but, it's not error */
        }
        tok->type = TK_CRUDE_BYTE;
        tok->base_num = 16;
        tok->u.byte   = (UChar )code;
      }
      break;

    case 'u':
      if (PEND) break;

      prev = p;
      if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
        mindigits = maxdigits = 4;
      u_hex_digits:
        r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code);
        if (r < 0) return r;
        if (p == prev) {  /* can't read nothing. */
          code = 0; /* but, it's not error */
        }
        tok->type = TK_CODE_POINT;
        tok->base_num = 16;
        tok->u.code   = code;
      }
      break;

    case 'U':
      if (PEND) break;

      prev = p;
      if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) {
        mindigits = maxdigits = 8;
        goto u_hex_digits;
      }
      break;

    case '0':
    case '1': case '2': case '3': case '4': case '5': case '6': case '7':
      if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
        PUNFETCH;
        prev = p;
        r = scan_octal_number(&p, end, 0, 3, enc, &code);
        if (r < 0) return r;
        if (code >= 256) return ONIGERR_TOO_BIG_NUMBER;
        if (p == prev) {  /* can't read nothing. */
          code = 0; /* but, it's not error */
        }
        tok->type = TK_CRUDE_BYTE;
        tok->base_num = 8;
        tok->u.byte   = (UChar )code;
      }
      break;

    default:
      PUNFETCH;
      r = fetch_escaped_value(&p, end, env, &c2);
      if (r < 0) return r;
      /* the escape changed the value: report it as a code point */
      if (tok->u.code != c2) {
        tok->u.code = c2;
        tok->type   = TK_CODE_POINT;
      }
      break;
    }
  }
  else if (c == '[') {
    if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
      OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
      tok->backp = p; /* point at '[' is read */
      PINC;
      /* only a POSIX bracket when a closing ":]" follows */
      if (str_exist_check_with_esc(send, 2, p, end,
                                   (OnigCodePoint )']', enc, syn)) {
        tok->type = TK_CC_POSIX_BRACKET_OPEN;
      }
      else {
        PUNFETCH;
        goto cc_in_cc;
      }
    }
    else {
    cc_in_cc:
      if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
        tok->type = TK_CC_OPEN_CC;
      }
      else {
        CC_ESC_WARN(env, (UChar* )"[");
      }
    }
  }
  else if (c == '&') {
    if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
        !PEND && (PPEEK_IS('&'))) {
      PINC;
      tok->type = TK_CC_AND;
    }
  }

 end:
  *src = p;
  return tok->type;
}

static int
fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env)
{
  int r;
  OnigCodePoint code;
  OnigCodePoint c;
  int mindigits, maxdigits;
  UChar* prev;
  int allow_num;
  OnigEncoding enc;
  OnigSyntaxType* syn;
  UChar* p;
  PFETCH_READY;

  enc = env->enc;
  syn = env->syntax;
  p = *src;

  if (tok->code_point_continue != 0) {
    r = get_next_code_point(&p, end, tok->base_num, enc, FALSE, &code);
    if (r == 1) {
      tok->code_point_continue = 0;
    }
    else if (r == 0) {
      tok->type   = TK_CODE_POINT;
      tok->u.code = code;
      goto out;
    }
    else
      return r; /* error */
  }

 start:
  if (PEND) {
    tok->type = TK_EOT;
    return tok->type;
  }

  tok->type = TK_STRING;
  tok->base_num = 0;
  tok->backp    = p;

  PFETCH(c);
  if (IS_MC_ESC_CODE(c, syn)) {
    if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

    tok->backp = p;
    PFETCH(c);

    tok->u.code = c;
    tok->escaped = 1;
    switch (c) {
    case '*':
      if (!
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 0; tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; case '+': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 1; tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; case '?': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 0; tok->u.repeat.upper = 1; greedy_check: tok->u.repeat.possessive = 0; greedy_check2: if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY) && tok->u.repeat.possessive == 0) { PFETCH(c); tok->u.repeat.greedy = 0; tok->u.repeat.possessive = 0; } else { possessive_check: tok->u.repeat.greedy = 1; if (!PEND && PPEEK_IS('+') && ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && tok->type != TK_INTERVAL) || (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && tok->type == TK_INTERVAL)) && tok->u.repeat.possessive == 0) { PFETCH(c); tok->u.repeat.possessive = 1; } } break; case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; r = fetch_interval(&p, end, tok, env); if (r < 0) return r; /* error */ if (r == 0) goto greedy_check2; else if (r == 2) { /* {n} */ if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) goto possessive_check; goto greedy_check2; } /* r == 1 : normal char */ break; case '|': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break; tok->type = TK_ALT; break; case '(': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; tok->type = TK_SUBEXP_OPEN; break; case ')': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; tok->type = TK_SUBEXP_CLOSE; break; case 'w': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; tok->type = TK_CHAR_TYPE; tok->u.prop.ctype = ONIGENC_CTYPE_WORD; tok->u.prop.not = 0; break; case 'W': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; tok->type = TK_CHAR_TYPE; tok->u.prop.ctype = ONIGENC_CTYPE_WORD; tok->u.prop.not = 1; break; case 'b': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; tok->type = TK_ANCHOR; tok->u.anchor = ANCR_WORD_BOUNDARY; break; case 'B': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; tok->type = TK_ANCHOR; tok->u.anchor = ANCR_NO_WORD_BOUNDARY; break; case 'y': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break; tok->type = TK_ANCHOR; tok->u.anchor = ANCR_TEXT_SEGMENT_BOUNDARY; break; case 'Y': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break; tok->type = TK_ANCHOR; tok->u.anchor = ANCR_NO_TEXT_SEGMENT_BOUNDARY; break; #ifdef USE_WORD_BEGIN_END case '<': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; tok->type = TK_ANCHOR; tok->u.anchor = ANCR_WORD_BEGIN; break; case '>': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; tok->type = TK_ANCHOR; tok->u.anchor = ANCR_WORD_END; break; #endif case 's': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; tok->type = TK_CHAR_TYPE; tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; tok->u.prop.not = 0; break; case 'S': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; tok->type = TK_CHAR_TYPE; tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; tok->u.prop.not = 1; break; case 'd': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; tok->type = TK_CHAR_TYPE; tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; tok->u.prop.not = 0; break; case 'D': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; tok->type = TK_CHAR_TYPE; tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; tok->u.prop.not = 1; break; case 'h': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; tok->type = TK_CHAR_TYPE; tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; tok->u.prop.not = 0; break; case 'H': if (! 
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; tok->type = TK_CHAR_TYPE; tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; tok->u.prop.not = 1; break; case 'K': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) break; tok->type = TK_KEEP; break; case 'R': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE)) break; tok->type = TK_GENERAL_NEWLINE; break; case 'N': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break; tok->type = TK_NO_NEWLINE; break; case 'O': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT)) break; tok->type = TK_TRUE_ANYCHAR; break; case 'X': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT)) break; tok->type = TK_TEXT_SEGMENT; break; case 'A': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; begin_buf: tok->type = TK_ANCHOR; tok->u.subtype = ANCR_BEGIN_BUF; break; case 'Z': if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { goto end_buf; } else { if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; tok->type = TK_ANCHOR; tok->u.subtype = ANCR_SEMI_END_BUF; } break; case 'z': if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) return ONIGERR_UNDEFINED_OPERATOR; if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; end_buf: tok->type = TK_ANCHOR; tok->u.subtype = ANCR_END_BUF; break; case 'G': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break; tok->type = TK_ANCHOR; tok->u.subtype = ANCR_BEGIN_POSITION; break; case '`': if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; goto begin_buf; break; case '\'': if (! 
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; goto end_buf; break; case 'o': if (PEND) break; prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { PINC; r = scan_octal_number(&p, end, 0, 11, enc, &code); if (r < 0) return r; if (!PEND) { if (IS_CODE_DIGIT_ASCII(enc, PPEEK)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; } tok->base_num = 8; goto brace_code_point_entry; } break; case 'x': if (PEND) break; prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); if (r < 0) return r; if (!PEND) { if (IS_CODE_XDIGIT_ASCII(enc, PPEEK)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; } tok->base_num = 16; brace_code_point_entry: if ((p > prev + enclen(enc, prev))) { if (PEND) return ONIGERR_INVALID_CODE_POINT_VALUE; if (PPEEK_IS('}')) { PINC; } else { r = check_code_point_sequence(p, end, tok->base_num, enc); if (r < 0) return r; if (r == 0) return ONIGERR_INVALID_CODE_POINT_VALUE; tok->code_point_continue = TRUE; } tok->type = TK_CODE_POINT; tok->u.code = code; } else { /* can't read nothing or invalid format */ p = prev; } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code); if (r < 0) return r; if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ } tok->type = TK_CRUDE_BYTE; tok->base_num = 16; tok->u.byte = (UChar )code; } break; case 'u': if (PEND) break; prev = p; mindigits = maxdigits = 4; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { u_hex_digits: r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code); if (r < 0) return r; if (p == prev) { /* can't read nothing. 
*/ code = 0; /* but, it's not error */ } tok->type = TK_CODE_POINT; tok->base_num = 16; tok->u.code = code; } break; case 'U': if (PEND) break; prev = p; if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { mindigits = maxdigits = 8; goto u_hex_digits; } break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': PUNFETCH; prev = p; r = scan_number(&p, end, enc); if (r < 0 || r > ONIG_MAX_BACKREF_NUM) { goto skip_backref; } if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && (r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */ if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (r > env->num_mem || IS_NULL(PARSEENV_MEMENV(env)[r].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; tok->u.backref.num = 1; tok->u.backref.ref1 = r; tok->u.backref.by_name = 0; #ifdef USE_BACKREF_WITH_LEVEL tok->u.backref.exist_level = 0; #endif break; } skip_backref: if (c == '8' || c == '9') { /* normal char */ p = prev; PINC; break; } p = prev; /* fall through */ case '0': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { prev = p; r = scan_octal_number(&p, end, 0, (c == '0' ? 2:3), enc, &code); if (r < 0 || r >= 256) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ } tok->type = TK_CRUDE_BYTE; tok->base_num = 8; tok->u.byte = (UChar )code; } else if (c != '0') { PINC; } break; case 'k': if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { PFETCH(c); if (c == '<' || c == '\'') { UChar* name_end; int* backs; int back_num; enum REF_NUM num_type; allow_num = 1; backref_start: prev = p; #ifdef USE_BACKREF_WITH_LEVEL name_end = NULL_UCHARP; /* no need. escape gcc warning. 
*/ r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end, env, &back_num, &tok->u.backref.level, &num_type); if (r == 1) tok->u.backref.exist_level = 1; else tok->u.backref.exist_level = 0; #else r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) return r; if (num_type != IS_NOT_NUM) { if (allow_num == 0) return ONIGERR_INVALID_BACKREF; if (num_type == IS_REL_NUM) { back_num = backref_rel_to_abs(back_num, env); } if (back_num <= 0) return ONIGERR_INVALID_BACKREF; if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || IS_NULL(PARSEENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; tok->u.backref.by_name = 0; tok->u.backref.num = 1; tok->u.backref.ref1 = back_num; } else { int num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { return ONIGERR_UNDEFINED_NAME_REFERENCE; } if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || IS_NULL(PARSEENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } tok->type = TK_BACKREF; tok->u.backref.by_name = 1; if (num == 1) { tok->u.backref.num = 1; tok->u.backref.ref1 = backs[0]; } else { tok->u.backref.num = num; tok->u.backref.refs = backs; } } } else PUNFETCH; } break; #ifdef USE_CALL case 'g': if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { PFETCH(c); if (c == '<' || c == '\'') { int gnum; UChar* name_end; enum REF_NUM num_type; allow_num = 1; call_start: prev = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type != IS_NOT_NUM) { if (allow_num == 0) return ONIGERR_UNDEFINED_GROUP_REFERENCE; if (num_type == IS_REL_NUM) { gnum = backref_rel_to_abs(gnum, env); if (gnum < 0) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); return ONIGERR_UNDEFINED_GROUP_REFERENCE; } } 
tok->u.call.by_number = 1; tok->u.call.gnum = gnum; } else { tok->u.call.by_number = 0; tok->u.call.gnum = 0; } tok->type = TK_CALL; tok->u.call.name = prev; tok->u.call.name_end = name_end; } else PUNFETCH; } break; #endif case 'Q': if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) { tok->type = TK_QUOTE_OPEN; } break; case 'p': case 'P': if (!PEND && PPEEK_IS('{') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = c == 'P'; if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { PFETCH(c); if (c == '^') { tok->u.prop.not = tok->u.prop.not == 0; } else PUNFETCH; } } break; default: { OnigCodePoint c2; PUNFETCH; r = fetch_escaped_value(&p, end, env, &c2); if (r < 0) return r; if (tok->u.code != c2) { tok->type = TK_CODE_POINT; tok->u.code = c2; } else { /* string */ p = tok->backp + enclen(enc, tok->backp); } } break; } } else { tok->u.code = c; tok->escaped = 0; #ifdef USE_VARIABLE_META_CHARS if ((c != ONIG_INEFFECTIVE_META_CHAR) && IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { if (c == MC_ANYCHAR(syn)) goto any_char; else if (c == MC_ANYTIME(syn)) goto any_time; else if (c == MC_ZERO_OR_ONE_TIME(syn)) goto zero_or_one_time; else if (c == MC_ONE_OR_MORE_TIME(syn)) goto one_or_more_time; else if (c == MC_ANYCHAR_ANYTIME(syn)) { tok->type = TK_ANYCHAR_ANYTIME; goto out; } } #endif switch (c) { case '.': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break; #ifdef USE_VARIABLE_META_CHARS any_char: #endif tok->type = TK_ANYCHAR; break; case '*': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break; #ifdef USE_VARIABLE_META_CHARS any_time: #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 0; tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; case '+': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break; #ifdef USE_VARIABLE_META_CHARS one_or_more_time: #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 1; tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; case '?': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break; #ifdef USE_VARIABLE_META_CHARS zero_or_one_time: #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 0; tok->u.repeat.upper = 1; goto greedy_check; break; case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; r = fetch_interval(&p, end, tok, env); if (r < 0) return r; /* error */ if (r == 0) goto greedy_check2; else if (r == 2) { /* {n} */ if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) goto possessive_check; goto greedy_check2; } /* r == 1 : normal char */ break; case '|': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break; tok->type = TK_ALT; break; case '(': if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { prev = p; PINC; if (! PEND) { c = PPEEK; if (c == '#') { PFETCH(c); while (1) { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); if (c == MC_ESC(syn)) { if (! PEND) PFETCH(c); } else { if (c == ')') break; } } goto start; } else if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL)) { int gnum; UChar* name; UChar* name_end; enum REF_NUM num_type; switch (c) { case '&': { PINC; name = p; r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, &num_type, FALSE); if (r < 0) return r; tok->type = TK_CALL; tok->u.call.by_number = 0; tok->u.call.gnum = 0; tok->u.call.name = name; tok->u.call.name_end = name_end; } break; case 'R': tok->type = TK_CALL; tok->u.call.by_number = 1; tok->u.call.gnum = 0; tok->u.call.name = p; PINC; if (! PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION; tok->u.call.name_end = p; break; case '-': case '+': goto lparen_qmark_num; break; default: if (! 
ONIGENC_IS_CODE_DIGIT(enc, c)) goto lparen_qmark_end; lparen_qmark_num: { name = p; r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type == IS_NOT_NUM) { return ONIGERR_INVALID_GROUP_NAME; } else { if (num_type == IS_REL_NUM) { gnum = backref_rel_to_abs(gnum, env); if (gnum < 0) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end); return ONIGERR_UNDEFINED_GROUP_REFERENCE; } } tok->u.call.by_number = 1; tok->u.call.gnum = gnum; } tok->type = TK_CALL; tok->u.call.name = name; tok->u.call.name_end = name_end; } break; } } else if (c == 'P' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) { PINC; /* skip 'P' */ if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); allow_num = 0; if (c == '=') { c = '('; goto backref_start; } else if (c == '>') { #ifdef USE_CALL c = '('; goto call_start; #else return ONIGERR_UNDEFINED_OPERATOR; #endif } else { p = prev; goto lparen_qmark_end2; } } } lparen_qmark_end: PUNFETCH; } lparen_qmark_end2: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; tok->type = TK_SUBEXP_OPEN; break; case ')': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; tok->type = TK_SUBEXP_CLOSE; break; case '^': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; tok->type = TK_ANCHOR; tok->u.subtype = (OPTON_SINGLELINE(env->options) ? ANCR_BEGIN_BUF : ANCR_BEGIN_LINE); break; case '$': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; tok->type = TK_ANCHOR; tok->u.subtype = (OPTON_SINGLELINE(env->options) ? ANCR_SEMI_END_BUF : ANCR_END_LINE); break; case '[': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; tok->type = TK_OPEN_CC; break; case ']': if (*src > env->pattern) /* /].../ is allowed. 
*/ CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); break; case '#': if (OPTON_EXTEND(env->options)) { while (!PEND) { PFETCH(c); if (ONIGENC_IS_CODE_NEWLINE(enc, c)) break; } goto start; break; } break; case ' ': case '\t': case '\n': case '\r': case '\f': if (OPTON_EXTEND(env->options)) goto start; break; default: /* string */ break; } } out: *src = p; return tok->type; } static int add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not, OnigEncoding enc ARG_UNUSED, OnigCodePoint sb_out, const OnigCodePoint mbr[]) { int i, r; OnigCodePoint j; int n = ONIGENC_CODE_RANGE_NUM(mbr); if (not == 0) { for (i = 0; i < n; i++) { for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { if (j >= sb_out) { if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) { r = add_code_range_to_buf(&(cc->mbuf), j, ONIGENC_CODE_RANGE_TO(mbr, i)); if (r != 0) return r; i++; } goto sb_end; } BITSET_SET_BIT(cc->bs, j); } } sb_end: for ( ; i < n; i++) { r = add_code_range_to_buf(&(cc->mbuf), ONIGENC_CODE_RANGE_FROM(mbr, i), ONIGENC_CODE_RANGE_TO(mbr, i)); if (r != 0) return r; } } else { OnigCodePoint prev = 0; for (i = 0; i < n; i++) { for (j = prev; j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) { if (j >= sb_out) { goto sb_end2; } BITSET_SET_BIT(cc->bs, j); } prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; } for (j = prev; j < sb_out; j++) { BITSET_SET_BIT(cc->bs, j); } sb_end2: prev = sb_out; for (i = 0; i < n; i++) { if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) { r = add_code_range_to_buf(&(cc->mbuf), prev, ONIGENC_CODE_RANGE_FROM(mbr, i) - 1); if (r != 0) return r; } prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; if (prev == 0) goto end; } r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT); if (r != 0) return r; } end: return 0; } static int add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not, OnigEncoding enc ARG_UNUSED, OnigCodePoint sb_out, const OnigCodePoint mbr[], OnigCodePoint limit) { int i, r; OnigCodePoint j; OnigCodePoint 
from; OnigCodePoint to; int n = ONIGENC_CODE_RANGE_NUM(mbr); if (not == 0) { for (i = 0; i < n; i++) { for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { if (j > limit) goto end; if (j >= sb_out) { if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) { to = ONIGENC_CODE_RANGE_TO(mbr, i); if (to > limit) to = limit; r = add_code_range_to_buf(&(cc->mbuf), j, to); if (r != 0) return r; i++; } goto sb_end; } BITSET_SET_BIT(cc->bs, j); } } sb_end: for ( ; i < n; i++) { from = ONIGENC_CODE_RANGE_FROM(mbr, i); to = ONIGENC_CODE_RANGE_TO(mbr, i); if (from > limit) break; if (to > limit) to = limit; r = add_code_range_to_buf(&(cc->mbuf), from, to); if (r != 0) return r; } } else { OnigCodePoint prev = 0; for (i = 0; i < n; i++) { from = ONIGENC_CODE_RANGE_FROM(mbr, i); if (from > limit) { for (j = prev; j < sb_out; j++) { BITSET_SET_BIT(cc->bs, j); } goto sb_end2; } for (j = prev; j < from; j++) { if (j >= sb_out) goto sb_end2; BITSET_SET_BIT(cc->bs, j); } prev = ONIGENC_CODE_RANGE_TO(mbr, i); if (prev > limit) prev = limit; prev++; if (prev == 0) goto end; } for (j = prev; j < sb_out; j++) { BITSET_SET_BIT(cc->bs, j); } sb_end2: prev = sb_out; for (i = 0; i < n; i++) { from = ONIGENC_CODE_RANGE_FROM(mbr, i); if (from > limit) goto last; if (prev < from) { r = add_code_range_to_buf(&(cc->mbuf), prev, from - 1); if (r != 0) return r; } prev = ONIGENC_CODE_RANGE_TO(mbr, i); if (prev > limit) prev = limit; prev++; if (prev == 0) goto end; } last: r = add_code_range_to_buf(&(cc->mbuf), prev, MAX_CODE_POINT); if (r != 0) return r; } end: return 0; }

/*
 * Add the character type `ctype` (or, if not != 0, its complement) to the
 * character class cc.
 *
 *   cc    : class node to extend (bitset cc->bs and range buffer cc->mbuf).
 *   ctype : ONIGENC_CTYPE_* value.
 *   not   : non-zero to add the complement of the type.
 *   env   : parse environment; enc and options are read from it.
 *
 * Returns 0 on success.  Otherwise it returns the error reported by the
 * encoding or by the range helpers, or ONIGERR_PARSER_BUG for a ctype that
 * the fallback switch does not know.
 */
static int
add_ctype_to_cc(CClassNode* cc, int ctype, int not, ParseEnv* env)
{
  int c, r;
  int ascii_mode;
  int is_single;
  const OnigCodePoint *ranges;
  OnigCodePoint limit;
  OnigCodePoint sb_out;
  OnigEncoding enc = env->enc;

  ascii_mode = OPTON_IS_ASCII_MODE_CTYPE(ctype, env->options);

  /* Preferred path: the encoding supplies a code range table for ctype. */
  r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
  if (r == 0) {
    if (ascii_mode == 0)
      r = add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
    else
      r = add_ctype_to_cc_by_range_limit(cc, ctype, not, env->enc, sb_out,
                                         ranges, ASCII_LIMIT);
    return r;
  }
  else if (r != ONIG_NO_SUPPORT_CONFIG) {
    return r;
  }

  /* Fallback (ONIG_NO_SUPPORT_CONFIG): test the code points below `limit`
     one by one with ONIGENC_IS_CODE_CTYPE. */
  r = 0;
  is_single = ONIGENC_IS_SINGLEBYTE(enc);
  limit = ascii_mode ? ASCII_LIMIT : SINGLE_BYTE_SIZE;

  /* NOTE(review): ADD_ALL_MULTI_BYTE_RANGE is a macro defined outside this
     chunk; it may assign r and/or return from this function -- confirm
     before restructuring the code below. */
  switch (ctype) {
  case ONIGENC_CTYPE_ALPHA:
  case ONIGENC_CTYPE_BLANK:
  case ONIGENC_CTYPE_CNTRL:
  case ONIGENC_CTYPE_DIGIT:
  case ONIGENC_CTYPE_LOWER:
  case ONIGENC_CTYPE_PUNCT:
  case ONIGENC_CTYPE_SPACE:
  case ONIGENC_CTYPE_UPPER:
  case ONIGENC_CTYPE_XDIGIT:
  case ONIGENC_CTYPE_ASCII:
  case ONIGENC_CTYPE_ALNUM:
    if (not != 0) {
      for (c = 0; c < (int )limit; c++) {
        if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) {
          if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
            BITSET_SET_BIT(cc->bs, c);
        }
      }
      /* codes in [limit, SINGLE_BYTE_SIZE) are never tested against the
         type, so all of them belong to the complement */
      for (c = limit; c < SINGLE_BYTE_SIZE; c++) {
        if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
          BITSET_SET_BIT(cc->bs, c);
      }

      if (is_single == 0)
        ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
    }
    else {
      for (c = 0; c < (int )limit; c++) {
        if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) {
          if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
            BITSET_SET_BIT(cc->bs, c);
        }
      }
    }
    break;

  case ONIGENC_CTYPE_GRAPH:
  case ONIGENC_CTYPE_PRINT:
  case ONIGENC_CTYPE_WORD:
    /* For these three types the multi-byte range is added to the positive
       class unless ascii_mode is set, and to the negated class only when
       ascii_mode is set. */
    if (not != 0) {
      for (c = 0; c < (int )limit; c++) {
        /* check invalid code point */
        if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
            && ! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
          BITSET_SET_BIT(cc->bs, c);
      }
      for (c = limit; c < SINGLE_BYTE_SIZE; c++) {
        if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
          BITSET_SET_BIT(cc->bs, c);
      }
      if (ascii_mode != 0 && is_single == 0)
        ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
    }
    else {
      for (c = 0; c < (int )limit; c++) {
        if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1)
            && ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
          BITSET_SET_BIT(cc->bs, c);
      }
      if (ascii_mode == 0 && is_single == 0)
        ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
    }
    break;

  default:
    return ONIGERR_PARSER_BUG;
    break;
  }

  return r;
}

/*
 * Parse the body of a POSIX bracket expression; *src is at the first
 * character after "[:" (an optional '^' negates the class).
 *
 * Returns 0 if a known name followed by ":]" was read (the type is added
 * to cc and *src is moved behind ":]"), 1 if the text is not a POSIX
 * bracket (*src is left unchanged), or ONIGERR_INVALID_POSIX_BRACKET_TYPE /
 * the error of add_ctype_to_cc().
 */
static int
prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ParseEnv* env)
{
#define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20
#define POSIX_BRACKET_NAME_MIN_LEN         4

  static PosixBracketEntryType PBS[] = {
    { (UChar* )"alnum",  ONIGENC_CTYPE_ALNUM,  5 },
    { (UChar* )"alpha",  ONIGENC_CTYPE_ALPHA,  5 },
    { (UChar* )"blank",  ONIGENC_CTYPE_BLANK,  5 },
    { (UChar* )"cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
    { (UChar* )"digit",  ONIGENC_CTYPE_DIGIT,  5 },
    { (UChar* )"graph",  ONIGENC_CTYPE_GRAPH,  5 },
    { (UChar* )"lower",  ONIGENC_CTYPE_LOWER,  5 },
    { (UChar* )"print",  ONIGENC_CTYPE_PRINT,  5 },
    { (UChar* )"punct",  ONIGENC_CTYPE_PUNCT,  5 },
    { (UChar* )"space",  ONIGENC_CTYPE_SPACE,  5 },
    { (UChar* )"upper",  ONIGENC_CTYPE_UPPER,  5 },
    { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
    { (UChar* )"ascii",  ONIGENC_CTYPE_ASCII,  5 },
    { (UChar* )"word",   ONIGENC_CTYPE_WORD,   4 },
    { (UChar* )NULL,     -1, 0 }
  };

  PosixBracketEntryType *pb;
  int not, i, r;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar *p = *src;

  if (PPEEK_IS('^')) {
    PINC_S;
    not = 1;
  }
  else
    not = 0;

  if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
    goto not_posix_bracket;

  for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
    if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
      p = (UChar* )onigenc_step(enc, p, end, pb->len);
      if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
return ONIGERR_INVALID_POSIX_BRACKET_TYPE; r = add_ctype_to_cc(cc, pb->ctype, not, env); if (r != 0) return r; PINC_S; PINC_S; *src = p; return 0; } } not_posix_bracket: c = 0; i = 0; while (!PEND && ((c = PPEEK) != ':') && c != ']') { PINC_S; if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; } if (c == ':' && ! PEND) { PINC_S; if (! PEND) { PFETCH_S(c); if (c == ']') return ONIGERR_INVALID_POSIX_BRACKET_TYPE; } } return 1; /* 1: is not POSIX bracket, but no error. */ } static int fetch_char_property_to_ctype(UChar** src, UChar* end, ParseEnv* env) { int r; OnigCodePoint c; OnigEncoding enc; UChar *prev, *start, *p; p = *src; enc = env->enc; r = ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; start = prev = p; while (!PEND) { prev = p; PFETCH_S(c); if (c == '}') { r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); if (r >= 0) { *src = p; } else { onig_scan_env_set_error_string(env, r, *src, prev); } return r; } else if (c == '(' || c == ')' || c == '{' || c == '|') { break; } } return r; } static int prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ParseEnv* env) { int r, ctype; CClassNode* cc; ctype = fetch_char_property_to_ctype(src, end, env); if (ctype < 0) return ctype; *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); r = add_ctype_to_cc(cc, ctype, FALSE, env); if (r != 0) return r; if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); return 0; } static int cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, ParseEnv* env) { int r; if (*state == CS_RANGE) return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; if (*state == CS_VALUE) { if (*val == CV_SB) BITSET_SET_BIT(cc->bs, (int )(*pcode)); else if (*val == CV_MB) { r = add_code_range(&(cc->mbuf), env, *pcode, *pcode); if (r < 0) return r; } } *state = CS_VALUE; *val = CV_CPROP; return 0; } static int cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, int* from_raw, int to_raw, CVAL intype, CVAL* type, CSTATE* state, ParseEnv* 
env) { int r; switch (*state) { case CS_VALUE: if (*type == CV_SB) { if (*from > 0xff) return ONIGERR_INVALID_CODE_POINT_VALUE; BITSET_SET_BIT(cc->bs, (int )(*from)); } else if (*type == CV_MB) { r = add_code_range(&(cc->mbuf), env, *from, *from); if (r < 0) return r; } break; case CS_RANGE: if (intype == *type) { if (intype == CV_SB) { if (*from > 0xff || to > 0xff) return ONIGERR_INVALID_CODE_POINT_VALUE; if (*from > to) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) goto ccs_range_end; else return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; } bitset_set_range(cc->bs, (int )*from, (int )to); } else { r = add_code_range(&(cc->mbuf), env, *from, to); if (r < 0) return r; } } else { if (*from > to) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) goto ccs_range_end; else return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; } bitset_set_range(cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff)); r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to); if (r < 0) return r; } ccs_range_end: *state = CS_COMPLETE; break; case CS_COMPLETE: case CS_START: *state = CS_VALUE; break; default: break; } *from_raw = to_raw; *from = to; *type = intype; return 0; } static int code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, ParseEnv* env) { int in_esc; OnigCodePoint code; OnigEncoding enc = env->enc; UChar* p = from; in_esc = 0; while (! 
PEND) { if (ignore_escaped && in_esc) { in_esc = 0; } else { PFETCH_S(code); if (code == c) return 1; if (code == MC_ESC(env->syntax)) in_esc = 1; } } return 0; } static int prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ParseEnv* env) { int r, neg, len, fetched, and_start; OnigCodePoint in_code, curr_code; UChar *p; Node* node; CClassNode *cc, *prev_cc; CClassNode work_cc; int curr_raw, in_raw; CSTATE state; CVAL in_type; CVAL curr_type; *np = NULL_NODE; INC_PARSE_DEPTH(env->parse_depth); state = CS_START; prev_cc = (CClassNode* )NULL; r = fetch_token_cc(tok, src, end, env, state); if (r == TK_CHAR && tok->u.code == (OnigCodePoint )'^' && tok->escaped == 0) { neg = 1; r = fetch_token_cc(tok, src, end, env, state); } else { neg = 0; } if (r < 0) return r; if (r == TK_CC_CLOSE) { if (! code_exist_check((OnigCodePoint )']', *src, env->pattern_end, 1, env)) return ONIGERR_EMPTY_CHAR_CLASS; CC_ESC_WARN(env, (UChar* )"]"); r = tok->type = TK_CHAR; /* allow []...] */ } *np = node = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(node); cc = CCLASS_(node); and_start = 0; curr_type = CV_UNDEF; p = *src; while (r != TK_CC_CLOSE) { fetched = 0; switch (r) { case TK_CHAR: any_char_in: len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code); if (len < 0) { r = len; goto err; } in_type = (len == 1) ? CV_SB : CV_MB; in_code = tok->u.code; in_raw = 0; goto val_entry2; break; case TK_CRUDE_BYTE: /* tok->base_num != 0 : octal or hexadec. */ if (! 
ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base_num != 0) { int i, j; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; UChar* psave = p; int base_num = tok->base_num; buf[0] = tok->u.byte; for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { r = fetch_token_cc(tok, &p, end, env, CS_COMPLETE); if (r < 0) goto err; if (r != TK_CRUDE_BYTE || tok->base_num != base_num) { fetched = 1; break; } buf[i] = tok->u.byte; } if (i < ONIGENC_MBC_MINLEN(env->enc)) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; goto err; } /* clear buf tail */ for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0'; len = enclen(env->enc, buf); if (i < len) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; goto err; } else if (i > len) { /* fetch back */ p = psave; for (i = 1; i < len; i++) { r = fetch_token_cc(tok, &p, end, env, CS_COMPLETE); if (r < 0) goto err; } fetched = 0; } if (i == 1) { in_code = (OnigCodePoint )buf[0]; goto crude_single; } else { in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); in_type = CV_MB; } } else { in_code = (OnigCodePoint )tok->u.byte; crude_single: in_type = CV_SB; } in_raw = 1; goto val_entry2; break; case TK_CODE_POINT: in_code = tok->u.code; in_raw = 1; val_entry: len = ONIGENC_CODE_TO_MBCLEN(env->enc, in_code); if (len < 0) { if (state != CS_RANGE || ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || in_code < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { r = len; goto err; } } in_type = (len == 1 ? 
CV_SB : CV_MB); val_entry2: r = cc_char_next(cc, &curr_code, in_code, &curr_raw, in_raw, in_type, &curr_type, &state, env); if (r != 0) goto err; break; case TK_CC_POSIX_BRACKET_OPEN: r = prs_posix_bracket(cc, &p, end, env); if (r < 0) goto err; if (r == 1) { /* is not POSIX bracket */ CC_ESC_WARN(env, (UChar* )"["); p = tok->backp; in_code = tok->u.code; in_raw = 0; goto val_entry; } goto next_cprop; break; case TK_CHAR_TYPE: r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env); if (r != 0) goto err; next_cprop: r = cc_cprop_next(cc, &curr_code, &curr_type, &state, env); if (r != 0) goto err; break; case TK_CHAR_PROPERTY: { int ctype = fetch_char_property_to_ctype(&p, end, env); if (ctype < 0) { r = ctype; goto err; } r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env); if (r != 0) goto err; goto next_cprop; } break; case TK_CC_RANGE: if (state == CS_VALUE) { r = fetch_token_cc(tok, &p, end, env, CS_RANGE); if (r < 0) goto err; fetched = 1; if (r == TK_CC_CLOSE) { /* allow [x-] */ range_end_val: in_code = (OnigCodePoint )'-'; in_raw = 0; goto val_entry; } else if (r == TK_CC_AND) { CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; } if (curr_type == CV_CPROP) { r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; goto err; } state = CS_RANGE; } else if (state == CS_START) { /* [-xa] is allowed */ in_code = tok->u.code; in_raw = 0; r = fetch_token_cc(tok, &p, end, env, CS_VALUE); if (r < 0) goto err; fetched = 1; /* [--x] or [a&&-x] is warned. 
*/ if (r == TK_CC_RANGE || and_start != 0) CC_ESC_WARN(env, (UChar* )"-"); goto val_entry; } else if (state == CS_RANGE) { CC_ESC_WARN(env, (UChar* )"-"); goto any_char_in; /* [!--] is allowed */ } else { /* CS_COMPLETE */ r = fetch_token_cc(tok, &p, end, env, CS_VALUE); if (r < 0) goto err; fetched = 1; if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ else if (r == TK_CC_AND) { CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; } if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */ } r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; goto err; } break; case TK_CC_OPEN_CC: /* [ */ { Node *anode; CClassNode* acc; if (state == CS_VALUE) { r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, &state, env); if (r != 0) goto err; } state = CS_COMPLETE; r = prs_cc(&anode, tok, &p, end, env); if (r != 0) { onig_node_free(anode); goto cc_open_err; } acc = CCLASS_(anode); r = or_cclass(cc, acc, env->enc); onig_node_free(anode); cc_open_err: if (r != 0) goto err; } break; case TK_CC_AND: /* && */ { if (state == CS_VALUE) { r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, &state, env); if (r != 0) goto err; } /* initialize local variables */ and_start = 1; state = CS_START; if (IS_NOT_NULL(prev_cc)) { r = and_cclass(prev_cc, cc, env->enc); if (r != 0) goto err; bbuf_free(cc->mbuf); } else { prev_cc = cc; cc = &work_cc; } initialize_cclass(cc); } break; case TK_EOT: r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS; goto err; break; default: r = ONIGERR_PARSER_BUG; goto err; break; } if (fetched) r = tok->type; else { r = fetch_token_cc(tok, &p, end, env, state); if (r < 0) goto err; } } if (state == CS_VALUE) { r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, &state, env); if (r != 0) goto err; } if (IS_NOT_NULL(prev_cc)) { r = and_cclass(prev_cc, cc, env->enc); if (r != 0) goto err; 
bbuf_free(cc->mbuf); cc = prev_cc; } if (neg != 0) NCCLASS_SET_NOT(cc); else NCCLASS_CLEAR_NOT(cc); if (IS_NCCLASS_NOT(cc) && IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { int is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); if (is_empty != 0) BITSET_IS_EMPTY(cc->bs, is_empty); if (is_empty == 0) { if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) BITSET_SET_BIT(cc->bs, NEWLINE_CODE); else add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); } } } *src = p; DEC_PARSE_DEPTH(env->parse_depth); return 0; err: if (cc != CCLASS_(*np)) bbuf_free(cc->mbuf); return r; } static int prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, ParseEnv* env, int group_head); #ifdef USE_CALLOUT /* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */ static int prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ParseEnv* env) { int r; int i; int in; int num; OnigCodePoint c; UChar* code_start; UChar* code_end; UChar* contents; UChar* tag_start; UChar* tag_end; int brace_nest; CalloutListEntry* e; RegexExt* ext; OnigEncoding enc = env->enc; UChar* p = *src; if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; brace_nest = 0; while (PPEEK_IS('{')) { brace_nest++; PINC_S; if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; } in = ONIG_CALLOUT_IN_PROGRESS; code_start = p; while (1) { if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; code_end = p; PFETCH_S(c); if (c == '}') { i = brace_nest; while (i > 0) { if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; PFETCH_S(c); if (c == '}') i--; else break; } if (i == 0) break; } } if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH_S(c); if (c == '[') { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; tag_end = tag_start = p; while (! PEND) { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; tag_end = p; PFETCH_S(c); if (c == ']') break; } if (! 
is_allowed_callout_tag_name(enc, tag_start, tag_end)) return ONIGERR_INVALID_CALLOUT_TAG_NAME; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH_S(c); } else { tag_start = tag_end = 0; } if (c == 'X') { in |= ONIG_CALLOUT_IN_RETRACTION; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH_S(c); } else if (c == '<') { in = ONIG_CALLOUT_IN_RETRACTION; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH_S(c); } else if (c == '>') { /* no needs (default) */ if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH_S(c); } if (c != cterm) return ONIGERR_INVALID_CALLOUT_PATTERN; r = reg_callout_list_entry(env, &num); if (r != 0) return r; ext = onig_get_regex_ext(env->reg); CHECK_NULL_RETURN_MEMERR(ext); if (IS_NULL(ext->pattern)) { r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end); if (r != ONIG_NORMAL) return r; } if (tag_start != tag_end) { r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); if (r != ONIG_NORMAL) return r; } contents = onigenc_strdup(enc, code_start, code_end); CHECK_NULL_RETURN_MEMERR(contents); e = onig_reg_callout_list_at(env->reg, num); if (IS_NULL(e)) { xfree(contents); return ONIGERR_MEMORY; } r = node_new_callout(np, ONIG_CALLOUT_OF_CONTENTS, num, ONIG_NON_NAME_ID, env); if (r != 0) { xfree(contents); return r; } e->of = ONIG_CALLOUT_OF_CONTENTS; e->in = in; e->name_id = ONIG_NON_NAME_ID; e->u.content.start = contents; e->u.content.end = contents + (code_end - code_start); *src = p; return 0; } static long prs_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* rl) { long v; long d; int flag; UChar* p; OnigCodePoint c; if (s >= end) return ONIGERR_INVALID_CALLOUT_ARG; flag = 1; v = 0; p = s; while (p < end) { c = ONIGENC_MBC_TO_CODE(enc, p, end); p += ONIGENC_MBC_ENC_LEN(enc, p); if (c >= '0' && c <= '9') { d = (long )(c - '0'); if (v > (max - d) / 10) return ONIGERR_INVALID_CALLOUT_ARG; v = v * 10 + d; } else if (sign_on != 0 && (c == '-' || c == '+')) { if (c == '-') flag = -1; } else 
return ONIGERR_INVALID_CALLOUT_ARG; sign_on = 0; } *rl = flag * v; return ONIG_NORMAL; } static void clear_callout_args(int n, unsigned int types[], OnigValue vals[]) { int i; for (i = 0; i < n; i++) { switch (types[i]) { case ONIG_TYPE_STRING: if (IS_NOT_NULL(vals[i].s.start)) xfree(vals[i].s.start); break; default: break; } } } static int prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, int max_arg_num, unsigned int types[], OnigValue vals[], ParseEnv* env) { #define MAX_CALLOUT_ARG_BYTE_LENGTH 128 int r; int n; int esc; int cn; UChar* s; UChar* e; UChar* eesc; OnigCodePoint c; UChar* bufend; UChar buf[MAX_CALLOUT_ARG_BYTE_LENGTH]; OnigEncoding enc = env->enc; UChar* p = *src; if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; c = 0; n = 0; while (n < ONIG_CALLOUT_MAX_ARGS_NUM) { cn = 0; esc = 0; eesc = 0; bufend = buf; s = e = p; while (1) { if (PEND) { r = ONIGERR_INVALID_CALLOUT_PATTERN; goto err_clear; } e = p; PFETCH_S(c); if (esc != 0) { esc = 0; if (c == '\\' || c == cterm || c == ',') { /* */ } else { e = eesc; cn++; } goto add_char; } else { if (c == '\\') { esc = 1; eesc = e; } else if (c == cterm || c == ',') break; else { size_t clen; add_char: if (skip_mode == FALSE) { clen = p - e; if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) { r = ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */ goto err_clear; } xmemcpy(bufend, e, clen); bufend += clen; } cn++; } } } if (cn != 0) { if (max_arg_num >= 0 && n >= max_arg_num) { r = ONIGERR_INVALID_CALLOUT_ARG; goto err_clear; } if (skip_mode == FALSE) { if ((types[n] & ONIG_TYPE_LONG) != 0) { int fixed = 0; if (cn > 0) { long rl; r = prs_long(enc, buf, bufend, 1, LONG_MAX, &rl); if (r == ONIG_NORMAL) { vals[n].l = rl; fixed = 1; types[n] = ONIG_TYPE_LONG; } } if (fixed == 0) { types[n] = (types[n] & ~ONIG_TYPE_LONG); if (types[n] == ONIG_TYPE_VOID) { r = ONIGERR_INVALID_CALLOUT_ARG; goto err_clear; } } } switch (types[n]) { case ONIG_TYPE_LONG: break; case ONIG_TYPE_CHAR: if (cn != 1) 
{ r = ONIGERR_INVALID_CALLOUT_ARG; goto err_clear; } vals[n].c = ONIGENC_MBC_TO_CODE(enc, buf, bufend); break; case ONIG_TYPE_STRING: { UChar* rs = onigenc_strdup(enc, buf, bufend); if (IS_NULL(rs)) { r = ONIGERR_MEMORY; goto err_clear; } vals[n].s.start = rs; vals[n].s.end = rs + (e - s); } break; case ONIG_TYPE_TAG: if (eesc != 0 || ! is_allowed_callout_tag_name(enc, s, e)) { r = ONIGERR_INVALID_CALLOUT_TAG_NAME; goto err_clear; } vals[n].s.start = s; vals[n].s.end = e; break; case ONIG_TYPE_VOID: case ONIG_TYPE_POINTER: r = ONIGERR_PARSER_BUG; goto err_clear; break; } } n++; } if (c == cterm) break; } if (c != cterm) { r = ONIGERR_INVALID_CALLOUT_PATTERN; goto err_clear; } *src = p; return n; err_clear: if (skip_mode == FALSE) clear_callout_args(n, types, vals); return r; } /* (*name[TAG]) (*name[TAG]{a,b,..}) */ static int prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ParseEnv* env) { int r; int i; int in; int num; int name_id; int arg_num; int max_arg_num; int opt_arg_num; int is_not_single; OnigCodePoint c; UChar* name_start; UChar* name_end; UChar* tag_start; UChar* tag_end; Node* node; CalloutListEntry* e; RegexExt* ext; unsigned int types[ONIG_CALLOUT_MAX_ARGS_NUM]; OnigValue vals[ONIG_CALLOUT_MAX_ARGS_NUM]; OnigEncoding enc = env->enc; UChar* p = *src; /* PFETCH_READY; */ if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; node = 0; name_start = p; while (1) { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; name_end = p; PFETCH_S(c); if (c == cterm || c == '[' || c == '{') break; } if (! is_allowed_callout_name(enc, name_start, name_end)) return ONIGERR_INVALID_CALLOUT_NAME; if (c == '[') { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; tag_end = tag_start = p; while (! PEND) { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; tag_end = p; PFETCH_S(c); if (c == ']') break; } if (! 
is_allowed_callout_tag_name(enc, tag_start, tag_end)) return ONIGERR_INVALID_CALLOUT_TAG_NAME; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH_S(c); } else { tag_start = tag_end = 0; } if (c == '{') { UChar* save; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; /* read for single check only */ save = p; arg_num = prs_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env); if (arg_num < 0) return arg_num; is_not_single = PPEEK_IS(cterm) ? 0 : 1; p = save; r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end, &name_id); if (r != ONIG_NORMAL) return r; max_arg_num = get_callout_arg_num_by_name_id(name_id); for (i = 0; i < max_arg_num; i++) { types[i] = get_callout_arg_type_by_name_id(name_id, i); } arg_num = prs_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env); if (arg_num < 0) return arg_num; if (PEND) { r = ONIGERR_END_PATTERN_IN_GROUP; goto err_clear; } PFETCH_S(c); } else { arg_num = 0; is_not_single = 0; r = get_callout_name_id_by_name(enc, is_not_single, name_start, name_end, &name_id); if (r != ONIG_NORMAL) return r; max_arg_num = get_callout_arg_num_by_name_id(name_id); for (i = 0; i < max_arg_num; i++) { types[i] = get_callout_arg_type_by_name_id(name_id, i); } } in = onig_get_callout_in_by_name_id(name_id); opt_arg_num = get_callout_opt_arg_num_by_name_id(name_id); if (arg_num > max_arg_num || arg_num < (max_arg_num - opt_arg_num)) { r = ONIGERR_INVALID_CALLOUT_ARG; goto err_clear; } if (c != cterm) { r = ONIGERR_INVALID_CALLOUT_PATTERN; goto err_clear; } r = reg_callout_list_entry(env, &num); if (r != 0) goto err_clear; ext = onig_get_regex_ext(env->reg); if (IS_NULL(ext)) { r = ONIGERR_MEMORY; goto err_clear; } if (IS_NULL(ext->pattern)) { r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end); if (r != ONIG_NORMAL) goto err_clear; } if (tag_start != tag_end) { r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); if (r != ONIG_NORMAL) goto err_clear; } e = onig_reg_callout_list_at(env->reg, 
num); if (IS_NULL(e)) { r = ONIGERR_MEMORY; goto err_clear; } r = node_new_callout(&node, ONIG_CALLOUT_OF_NAME, num, name_id, env); if (r != ONIG_NORMAL) goto err_clear; e->of = ONIG_CALLOUT_OF_NAME; e->in = in; e->name_id = name_id; e->type = onig_get_callout_type_by_name_id(name_id); e->start_func = onig_get_callout_start_func_by_name_id(name_id); e->end_func = onig_get_callout_end_func_by_name_id(name_id); e->u.arg.num = max_arg_num; e->u.arg.passed_num = arg_num; for (i = 0; i < max_arg_num; i++) { e->u.arg.types[i] = types[i]; if (i < arg_num) e->u.arg.vals[i] = vals[i]; else e->u.arg.vals[i] = get_callout_opt_default_by_name_id(name_id, i); } *np = node; *src = p; return 0; err_clear: clear_callout_args(arg_num, types, vals); return r; } #endif static int prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, ParseEnv* env) { int r, num; Node *target; OnigOptionType option; OnigCodePoint c; int list_capture; OnigEncoding enc; UChar* p; PFETCH_READY; p = *src; enc = env->enc; *np = NULL; if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; option = env->options; c = PPEEK; if (c == '?' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { PINC; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); switch (c) { case ':': /* (?:...) grouping only */ group: r = fetch_token(tok, &p, end, env); if (r < 0) return r; r = prs_alts(np, tok, term, &p, end, env, FALSE); if (r < 0) return r; *src = p; return 1; /* group */ break; case '=': *np = node_new_anchor(ANCR_PREC_READ); break; case '!': /* preceding read */ *np = node_new_anchor(ANCR_PREC_READ_NOT); break; case '>': /* (?>...) 
stop backtrack */ *np = node_new_bag(BAG_STOP_BACKTRACK); break; case '\'': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { goto named_group1; } else return ONIGERR_UNDEFINED_GROUP_OPTION; break; case '<': /* look behind (?<=...), (?syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { UChar *name; UChar *name_end; enum REF_NUM num_type; PUNFETCH; c = '<'; named_group1: list_capture = 0; #ifdef USE_CAPTURE_HISTORY named_group2: #endif name = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, &num_type, FALSE); if (r < 0) return r; num = scan_env_add_mem_entry(env); if (num < 0) return num; if (list_capture != 0 && num >= (int )MEM_STATUS_BITS_NUM) return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; r = name_add(env->reg, name, name_end, num, env); if (r != 0) return r; *np = node_new_memory(1); CHECK_NULL_RETURN_MEMERR(*np); BAG_(*np)->m.regnum = num; if (list_capture != 0) MEM_STATUS_ON_SIMPLE(env->cap_history, num); env->num_named++; } else { return ONIGERR_UNDEFINED_GROUP_OPTION; } } break; case '~': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP)) { Node* absent; Node* expr; int head_bar; int is_range_cutter; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; if (PPEEK_IS('|')) { /* (?~|generator|absent) */ PINC; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; head_bar = 1; if (PPEEK_IS(')')) { /* (?~|) : range clear */ PINC; r = make_range_clear(np, env); if (r != 0) return r; goto end; } } else head_bar = 0; r = fetch_token(tok, &p, end, env); if (r < 0) return r; r = prs_alts(&absent, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(absent); return r; } expr = NULL_NODE; is_range_cutter = 0; if (head_bar != 0) { Node* top = absent; if (NODE_TYPE(top) != NODE_ALT || IS_NULL(NODE_CDR(top))) { expr = NULL_NODE; is_range_cutter = 1; /* return ONIGERR_INVALID_ABSENT_GROUP_GENERATOR_PATTERN; */ } else { absent = NODE_CAR(top); expr = NODE_CDR(top); NODE_CAR(top) = NULL_NODE; NODE_CDR(top) = NULL_NODE; 
onig_node_free(top); if (IS_NULL(NODE_CDR(expr))) { top = expr; expr = NODE_CAR(top); NODE_CAR(top) = NULL_NODE; onig_node_free(top); } } } r = make_absent_tree(np, absent, expr, is_range_cutter, env); if (r != 0) { return r; } goto end; } else { return ONIGERR_UNDEFINED_GROUP_OPTION; } break; #ifdef USE_CALLOUT case '{': if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS)) return ONIGERR_UNDEFINED_GROUP_OPTION; r = prs_callout_of_contents(np, ')', &p, end, env); if (r != 0) return r; goto end; break; #endif case '(': /* (?()...) */ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE)) { UChar *prev; Node* condition; int condition_is_checker; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; if (IS_CODE_DIGIT_ASCII(enc, c) || c == '-' || c == '+' || c == '<' || c == '\'') { #ifdef USE_BACKREF_WITH_LEVEL int exist_level; int level; #endif UChar* name_end; int back_num; enum REF_NUM num_type; int is_enclosed; is_enclosed = (c == '<' || c == '\'') ? 1 : 0; if (! is_enclosed) PUNFETCH; prev = p; #ifdef USE_BACKREF_WITH_LEVEL exist_level = 0; name_end = NULL_UCHARP; /* no need. escape gcc warning. */ r = fetch_name_with_level( (OnigCodePoint )(is_enclosed != 0 ? c : '('), &p, end, &name_end, env, &back_num, &level, &num_type); if (r == 1) exist_level = 1; #else r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? 
c : '('), &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) { if (is_enclosed == 0) { goto any_condition; } else return r; } condition_is_checker = 1; if (num_type != IS_NOT_NUM) { if (num_type == IS_REL_NUM) { back_num = backref_rel_to_abs(back_num, env); } if (back_num <= 0) return ONIGERR_INVALID_BACKREF; if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || IS_NULL(PARSEENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } condition = node_new_backref_checker(1, &back_num, FALSE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif env); } else { int num; int* backs; num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { return ONIGERR_UNDEFINED_NAME_REFERENCE; } if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || IS_NULL(PARSEENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } condition = node_new_backref_checker(num, backs, TRUE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif env); } if (is_enclosed != 0) { if (PEND) goto err_if_else; PFETCH(c); if (c != ')') goto err_if_else; } } #ifdef USE_CALLOUT else if (c == '?') { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS)) { if (! 
PEND && PPEEK_IS('{')) { /* condition part is callouts of contents: (?(?{...})THEN|ELSE) */ condition_is_checker = 0; PFETCH(c); r = prs_callout_of_contents(&condition, ')', &p, end, env); if (r != 0) return r; goto end_condition; } } goto any_condition; } else if (c == '*' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) { condition_is_checker = 0; r = prs_callout_of_name(&condition, ')', &p, end, env); if (r != 0) return r; goto end_condition; } #endif else { any_condition: PUNFETCH; condition_is_checker = 0; r = fetch_token(tok, &p, end, env); if (r < 0) return r; r = prs_alts(&condition, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(condition); return r; } } #ifdef USE_CALLOUT end_condition: #endif CHECK_NULL_RETURN_MEMERR(condition); if (PEND) { err_if_else: onig_node_free(condition); return ONIGERR_END_PATTERN_IN_GROUP; } if (PPEEK_IS(')')) { /* case: empty body: make backref checker */ if (condition_is_checker == 0) { onig_node_free(condition); return ONIGERR_INVALID_IF_ELSE_SYNTAX; } PFETCH(c); *np = condition; } else { /* if-else */ int then_is_empty; Node *Then, *Else; Then = 0; if (PPEEK_IS('|')) { PFETCH(c); then_is_empty = 1; } else then_is_empty = 0; r = fetch_token(tok, &p, end, env); if (r < 0) { onig_node_free(condition); return r; } r = prs_alts(&target, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(condition); onig_node_free(target); return r; } if (then_is_empty != 0) { Else = target; } else { if (NODE_TYPE(target) == NODE_ALT) { Then = NODE_CAR(target); if (NODE_CDR(NODE_CDR(target)) == NULL_NODE) { Else = NODE_CAR(NODE_CDR(target)); cons_node_free_alone(NODE_CDR(target)); } else { Else = NODE_CDR(target); } cons_node_free_alone(target); } else { Then = target; Else = 0; } } *np = node_new_bag_if_else(condition, Then, Else); if (IS_NULL(*np)) { onig_node_free(condition); onig_node_free(Then); onig_node_free(Else); return ONIGERR_MEMORY; } } goto end; } else { return ONIGERR_UNDEFINED_GROUP_OPTION; 
} break; #ifdef USE_CAPTURE_HISTORY case '@': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { PFETCH(c); if (c == '<' || c == '\'') { list_capture = 1; goto named_group2; /* (?@...) */ } PUNFETCH; } *np = node_new_memory(0); CHECK_NULL_RETURN_MEMERR(*np); num = scan_env_add_mem_entry(env); if (num < 0) { return num; } else if (num >= (int )MEM_STATUS_BITS_NUM) { return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; } BAG_(*np)->m.regnum = num; MEM_STATUS_ON_SIMPLE(env->cap_history, num); } else { return ONIGERR_UNDEFINED_GROUP_OPTION; } break; #endif #ifdef USE_WHOLE_OPTIONS case 'C': case 'I': case 'L': if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_WHOLE_OPTIONS)) return ONIGERR_UNDEFINED_GROUP_OPTION; goto options_start; break; #endif case 'P': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); if (c == '<') goto named_group1; return ONIGERR_UNDEFINED_GROUP_OPTION; } /* else fall */ case 'W': case 'D': case 'S': case 'y': if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) return ONIGERR_UNDEFINED_GROUP_OPTION; /* else fall */ #ifdef USE_POSIXLINE_OPTION case 'p': #endif case 'a': case '-': case 'i': case 'm': case 's': case 'x': #ifdef USE_WHOLE_OPTIONS options_start: #endif { int neg; #ifdef USE_WHOLE_OPTIONS int whole_options; whole_options = FALSE; #endif neg = FALSE; while (1) { switch (c) { case ':': case ')': break; case '-': neg = TRUE; break; case 'x': OPTION_NEGATE(option, ONIG_OPTION_EXTEND, neg); break; case 'i': OPTION_NEGATE(option, ONIG_OPTION_IGNORECASE, neg); break; case 's': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { OPTION_NEGATE(option, ONIG_OPTION_MULTILINE, neg); } else return ONIGERR_UNDEFINED_GROUP_OPTION; break; case 'm': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == FALSE ? 
TRUE : FALSE)); } else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) { OPTION_NEGATE(option, ONIG_OPTION_MULTILINE, neg); } else return ONIGERR_UNDEFINED_GROUP_OPTION; break; #ifdef USE_POSIXLINE_OPTION case 'p': OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); break; #endif case 'W': if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) return ONIGERR_UNDEFINED_GROUP_OPTION; OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break; case 'D': if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) return ONIGERR_UNDEFINED_GROUP_OPTION; OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break; case 'S': if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) return ONIGERR_UNDEFINED_GROUP_OPTION; OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break; case 'P': if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) return ONIGERR_UNDEFINED_GROUP_OPTION; OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break; case 'y': /* y{g}, y{w} */ { if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) return ONIGERR_UNDEFINED_GROUP_OPTION; if (neg == TRUE) return ONIGERR_UNDEFINED_GROUP_OPTION; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; if (! PPEEK_IS('{')) return ONIGERR_UNDEFINED_GROUP_OPTION; PFETCH(c); if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); switch (c) { case 'g': if (! ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE); OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE); break; #ifdef USE_UNICODE_WORD_BREAK case 'w': if (! 
ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE); OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE); break; #endif default: return ONIGERR_UNDEFINED_GROUP_OPTION; break; } if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); if (c != '}') return ONIGERR_UNDEFINED_GROUP_OPTION; } /* case 'y' */ break; case 'a': if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_PYTHON)) return ONIGERR_UNDEFINED_GROUP_OPTION; OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break; #ifdef USE_WHOLE_OPTIONS case 'C': if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_WHOLE_OPTIONS)) return ONIGERR_UNDEFINED_GROUP_OPTION; if (neg == TRUE) return ONIGERR_INVALID_GROUP_OPTION; OPTION_NEGATE(option, ONIG_OPTION_DONT_CAPTURE_GROUP, neg); whole_options = TRUE; break; case 'I': if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_WHOLE_OPTIONS)) return ONIGERR_UNDEFINED_GROUP_OPTION; if (neg == TRUE) return ONIGERR_INVALID_GROUP_OPTION; OPTION_NEGATE(option, ONIG_OPTION_IGNORECASE_IS_ASCII, neg); whole_options = TRUE; break; case 'L': if (! 
IS_SYNTAX_BV(env->syntax, ONIG_SYN_WHOLE_OPTIONS)) return ONIGERR_UNDEFINED_GROUP_OPTION; if (neg == TRUE) return ONIGERR_INVALID_GROUP_OPTION; OPTION_NEGATE(option, ONIG_OPTION_FIND_LONGEST, neg); whole_options = TRUE; break; #endif default: return ONIGERR_UNDEFINED_GROUP_OPTION; } if (c == ')') { *np = node_new_option(option); CHECK_NULL_RETURN_MEMERR(*np); r = 2; /* option only */ #ifdef USE_WHOLE_OPTIONS set_whole_options: if (whole_options == TRUE) { if ((env->flags & PE_FLAG_HAS_WHOLE_OPTIONS) != 0) return ONIGERR_INVALID_GROUP_OPTION; env->flags |= PE_FLAG_HAS_WHOLE_OPTIONS; NODE_STATUS_ADD(*np, WHOLE_OPTIONS); if (OPTON_DONT_CAPTURE_GROUP(option)) { env->reg->options |= ONIG_OPTION_DONT_CAPTURE_GROUP; if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) return ONIGERR_INVALID_COMBINATION_OF_OPTIONS; } if ((option & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) { env->reg->case_fold_flag &= ~(INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR | ONIGENC_CASE_FOLD_TURKISH_AZERI); env->reg->case_fold_flag |= ONIGENC_CASE_FOLD_ASCII_ONLY; env->reg->options |= ONIG_OPTION_IGNORECASE_IS_ASCII; } if (OPTON_FIND_LONGEST(option)) { env->reg->options |= ONIG_OPTION_FIND_LONGEST; } } #endif *src = p; return r; } else if (c == ':') { OnigOptionType prev = env->options; env->options = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; r = prs_alts(&target, tok, term, &p, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); return r; } *np = node_new_option(option); CHECK_NULL_RETURN_MEMERR(*np); NODE_BODY(*np) = target; #ifdef USE_WHOLE_OPTIONS if (whole_options == TRUE) { r = 0; goto set_whole_options; } #endif *src = p; return 0; } if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); } /* while (1) */ } break; default: return ONIGERR_UNDEFINED_GROUP_OPTION; } } #ifdef USE_CALLOUT else if (c == '*' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) { 
PINC; r = prs_callout_of_name(np, ')', &p, end, env); if (r != 0) return r; goto end; } #endif else { if (OPTON_DONT_CAPTURE_GROUP(env->options)) goto group; *np = node_new_memory(0); CHECK_NULL_RETURN_MEMERR(*np); num = scan_env_add_mem_entry(env); if (num < 0) return num; BAG_(*np)->m.regnum = num; } CHECK_NULL_RETURN_MEMERR(*np); r = fetch_token(tok, &p, end, env); if (r < 0) return r; r = prs_alts(&target, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(target); return r; } NODE_BODY(*np) = target; if (NODE_TYPE(*np) == NODE_BAG) { if (BAG_(*np)->type == BAG_MEMORY) { /* Don't move this to previous of prs_alts() */ r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np); if (r != 0) return r; } } end: *src = p; return 0; } static const char* PopularQStr[] = { "?", "*", "+", "??", "*?", "+?" }; static const char* ReduceQStr[] = { "", "", "*", "*?", "??", "+ and ??", "+? and ?" }; static int assign_quantifier_body(Node* qnode, Node* target, int group, ParseEnv* env) { QuantNode* qn; qn = QUANT_(qnode); if (qn->lower == 1 && qn->upper == 1) return 1; switch (NODE_TYPE(target)) { case NODE_STRING: if (group == 0) { if (str_node_can_be_split(target, env->enc)) { Node* n = str_node_split_last_char(target, env->enc); if (IS_NOT_NULL(n)) { NODE_BODY(qnode) = n; return 2; } } } break; case NODE_QUANT: { /* check redundant double repeat. */ /* verbose warn (?:.?)? etc... but not warn (.?)? etc... 
*/ QuantNode* qnt = QUANT_(target); int nestq_num = quantifier_type_num(qn); int targetq_num = quantifier_type_num(qnt); #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR if (targetq_num >= 0 && nestq_num >= 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { UChar buf[WARN_BUFSIZE]; switch(ReduceTypeTable[targetq_num][nestq_num]) { case RQ_ASIS: break; case RQ_DEL: if (onig_verb_warn != onig_null_warn) { onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, env->pattern, env->pattern_end, (UChar* )"redundant nested repeat operator"); (*onig_verb_warn)((char* )buf); } goto warn_exit; break; default: if (onig_verb_warn != onig_null_warn) { onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, env->pattern, env->pattern_end, (UChar* )"nested repeat operator %s and %s was replaced with '%s'", PopularQStr[targetq_num], PopularQStr[nestq_num], ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); (*onig_verb_warn)((char* )buf); } goto warn_exit; break; } } warn_exit: #endif if (targetq_num >= 0 && nestq_num < 0) { if (targetq_num == 1 || targetq_num == 2) { /* * or + */ /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) { qn->upper = (qn->lower == 0 ? 1 : qn->lower); } } } else { int r; NODE_BODY(qnode) = target; r = onig_reduce_nested_quantifier(qnode); return r; } } break; default: break; } NODE_BODY(qnode) = target; return 0; } #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS static int clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) { BBuf *tbuf; int r; if (IS_NCCLASS_NOT(cc)) { bitset_invert(cc->bs); if (! 
ONIGENC_IS_SINGLEBYTE(enc)) { r = not_code_range_buf(enc, cc->mbuf, &tbuf); if (r != 0) return r; bbuf_free(cc->mbuf); cc->mbuf = tbuf; } NCCLASS_CLEAR_NOT(cc); } return 0; } #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ #define ADD_CODE_INTO_CC(cc, code, enc) do {\ if (ONIGENC_MBC_MINLEN(enc) > 1 || ONIGENC_CODE_TO_MBCLEN(enc, code) != 1) {\ add_code_range_to_buf(&((cc)->mbuf), code, code);\ }\ else {\ BITSET_SET_BIT((cc)->bs, code);\ }\ } while (0) extern int onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]) { int i; Node* node; CClassNode* cc; *rnode = NULL_NODE; node = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(node); cc = CCLASS_(node); for (i = 0; i < n; i++) { ADD_CODE_INTO_CC(cc, codes[i], enc); } *rnode = node; return 0; } typedef struct { ParseEnv* env; CClassNode* cc; Node* alt_root; Node** ptail; } IApplyCaseFoldArg; static int i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) { IApplyCaseFoldArg* iarg; ParseEnv* env; OnigEncoding enc; CClassNode* cc; iarg = (IApplyCaseFoldArg* )arg; env = iarg->env; cc = iarg->cc; enc = env->enc; if (to_len == 1) { int is_in = onig_is_code_in_cc(enc, from, cc); #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || (is_in == 0 && IS_NCCLASS_NOT(cc))) { ADD_CODE_INTO_CC(cc, *to, enc); } #else if (is_in != 0) { if (ONIGENC_MBC_MINLEN(enc) > 1 || ONIGENC_CODE_TO_MBCLEN(enc, *to) != 1) { if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, enc); add_code_range(&(cc->mbuf), env, *to, *to); } else { if (IS_NCCLASS_NOT(cc)) { BITSET_CLEAR_BIT(cc->bs, *to); } else BITSET_SET_BIT(cc->bs, *to); } } #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ } else { int r, i, len; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; if (onig_is_code_in_cc(enc, from, cc) #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS && !IS_NCCLASS_NOT(cc) #endif ) { int n, j, m, index; Node* list_node; Node* ns[3]; n = 0; for (i = 0; i < 
to_len; i++) { OnigCodePoint code; Node* csnode; CClassNode* cs_cc; index = 0; if (ONIGENC_IS_UNICODE_ENCODING(enc) && (index = onigenc_unicode_fold1_key(&to[i])) >= 0) { csnode = node_new_cclass(); cs_cc = CCLASS_(csnode); if (IS_NULL(csnode)) { err_free_ns: for (j = 0; j < n; j++) onig_node_free(ns[j]); return ONIGERR_MEMORY; } m = FOLDS1_UNFOLDS_NUM(index); for (j = 0; j < m; j++) { code = FOLDS1_UNFOLDS(index)[j]; ADD_CODE_INTO_CC(cs_cc, code, enc); } ADD_CODE_INTO_CC(cs_cc, to[i], enc); ns[n++] = csnode; } else { len = ONIGENC_CODE_TO_MBC(enc, to[i], buf); if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) { csnode = node_new_str(buf, buf + len); if (IS_NULL(csnode)) goto err_free_ns; if (index == 0) NODE_STATUS_ADD(csnode, IGNORECASE); else NODE_STRING_SET_CASE_EXPANDED(csnode); ns[n++] = csnode; } else { r = onig_node_str_cat(ns[n-1], buf, buf + len); if (r < 0) goto err_free_ns; } } } if (n == 1) list_node = ns[0]; else list_node = make_list(n, ns); *(iarg->ptail) = onig_node_new_alt(list_node, NULL_NODE); if (IS_NULL(*(iarg->ptail))) { onig_node_free(list_node); return ONIGERR_MEMORY; } iarg->ptail = &(NODE_CDR((*(iarg->ptail)))); } } return 0; } static int prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, ParseEnv* env, int group_head) { int r, len, group; Node* qn; Node** tp; unsigned int parse_depth; retry: group = 0; *np = NULL; if (tok->type == (enum TokenSyms )term) goto end_of_token; parse_depth = env->parse_depth; switch (tok->type) { case TK_ALT: case TK_EOT: end_of_token: *np = node_new_empty(); CHECK_NULL_RETURN_MEMERR(*np); return tok->type; break; case TK_SUBEXP_OPEN: r = prs_bag(np, tok, TK_SUBEXP_CLOSE, src, end, env); if (r < 0) return r; if (r == 1) { /* group */ if (group_head == 0) group = 1; else { Node* target = *np; *np = node_new_group(target); if (IS_NULL(*np)) { onig_node_free(target); return ONIGERR_MEMORY; } group = 2; } } else if (r == 2) { /* option only */ if (IS_SYNTAX_BV(env->syntax, 
ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH)) { env->options = BAG_(*np)->o.options; r = fetch_token(tok, src, end, env); if (r < 0) return r; onig_node_free(*np); goto retry; } else { Node* target; OnigOptionType prev = env->options; env->options = BAG_(*np)->o.options; r = fetch_token(tok, src, end, env); if (r < 0) return r; r = prs_alts(&target, tok, term, src, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); return r; } NODE_BODY(*np) = target; } return tok->type; } break; case TK_SUBEXP_CLOSE: if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; if (tok->escaped) goto tk_crude_byte; else goto tk_byte; break; case TK_STRING: tk_byte: { *np = node_new_str_with_options(tok->backp, *src, env->options); CHECK_NULL_RETURN_MEMERR(*np); while (1) { r = fetch_token(tok, src, end, env); if (r < 0) return r; if (r != TK_STRING) break; r = onig_node_str_cat(*np, tok->backp, *src); if (r < 0) return r; } string_end: tp = np; goto repeat; } break; case TK_CRUDE_BYTE: tk_crude_byte: { *np = node_new_str_crude_char(tok->u.byte, env->options); CHECK_NULL_RETURN_MEMERR(*np); len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { if (len == enclen(env->enc, STR_(*np)->s)) { r = fetch_token(tok, src, end, env); goto tk_crude_byte_end; } } r = fetch_token(tok, src, end, env); if (r < 0) return r; if (r != TK_CRUDE_BYTE) return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; r = node_str_cat_char(*np, tok->u.byte); if (r < 0) return r; len++; } tk_crude_byte_end: if (! 
ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) return ONIGERR_INVALID_WIDE_CHAR_VALUE; NODE_STRING_CLEAR_CRUDE(*np); goto string_end; } break; case TK_CODE_POINT: { UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code); if (len < 0) return len; len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG *np = node_new_str_crude(buf, buf + len, env->options); #else *np = node_new_str_with_options(buf, buf + len, env->options); #endif CHECK_NULL_RETURN_MEMERR(*np); } break; case TK_QUOTE_OPEN: { OnigCodePoint end_op[2]; UChar *qstart, *qend, *nextp; end_op[0] = (OnigCodePoint )MC_ESC(env->syntax); end_op[1] = (OnigCodePoint )'E'; qstart = *src; qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); if (IS_NULL(qend)) { nextp = qend = end; } *np = node_new_str_with_options(qstart, qend, env->options); CHECK_NULL_RETURN_MEMERR(*np); *src = nextp; } break; case TK_CHAR_TYPE: { switch (tok->u.prop.ctype) { case ONIGENC_CTYPE_WORD: *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not, env->options); CHECK_NULL_RETURN_MEMERR(*np); break; case ONIGENC_CTYPE_SPACE: case ONIGENC_CTYPE_DIGIT: case ONIGENC_CTYPE_XDIGIT: { CClassNode* cc; *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); r = add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env); if (r != 0) { onig_node_free(*np); *np = NULL_NODE; return r; } if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); } break; default: return ONIGERR_PARSER_BUG; break; } } break; case TK_CHAR_PROPERTY: r = prs_char_property(np, tok, src, end, env); if (r != 0) return r; break; case TK_OPEN_CC: { CClassNode* cc; r = prs_cc(np, tok, src, end, env); if (r != 0) return r; cc = CCLASS_(*np); if (OPTON_IGNORECASE(env->options)) { IApplyCaseFoldArg iarg; iarg.env = env; iarg.cc = cc; iarg.alt_root = NULL_NODE; iarg.ptail = &(iarg.alt_root); r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag, i_apply_case_fold, 
&iarg); if (r != 0) { onig_node_free(iarg.alt_root); return r; } if (IS_NOT_NULL(iarg.alt_root)) { Node* work = onig_node_new_alt(*np, iarg.alt_root); if (IS_NULL(work)) { onig_node_free(iarg.alt_root); return ONIGERR_MEMORY; } *np = work; } } } break; case TK_ANYCHAR: *np = node_new_anychar(env->options); CHECK_NULL_RETURN_MEMERR(*np); break; case TK_ANYCHAR_ANYTIME: *np = node_new_anychar(env->options); CHECK_NULL_RETURN_MEMERR(*np); qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE); CHECK_NULL_RETURN_MEMERR(qn); NODE_BODY(qn) = *np; *np = qn; break; case TK_BACKREF: len = tok->u.backref.num; *np = node_new_backref(len, (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)), tok->u.backref.by_name, #ifdef USE_BACKREF_WITH_LEVEL tok->u.backref.exist_level, tok->u.backref.level, #endif env); CHECK_NULL_RETURN_MEMERR(*np); break; #ifdef USE_CALL case TK_CALL: { int gnum = tok->u.call.gnum; *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum, tok->u.call.by_number); CHECK_NULL_RETURN_MEMERR(*np); env->num_call++; if (tok->u.call.by_number != 0 && gnum == 0) { env->flags |= PE_FLAG_HAS_CALL_ZERO; } } break; #endif case TK_ANCHOR: *np = node_new_anchor_with_options(tok->u.anchor, env->options); CHECK_NULL_RETURN_MEMERR(*np); break; case TK_REPEAT: case TK_INTERVAL: if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS)) return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED; else { *np = node_new_empty(); CHECK_NULL_RETURN_MEMERR(*np); } } else { goto tk_byte; } break; case TK_KEEP: r = node_new_keep(np, env); if (r < 0) return r; break; case TK_GENERAL_NEWLINE: r = node_new_general_newline(np, env); if (r < 0) return r; break; case TK_NO_NEWLINE: r = node_new_no_newline(np, env); if (r < 0) return r; break; case TK_TRUE_ANYCHAR: r = node_new_true_anychar(np); if (r < 0) return r; break; case TK_TEXT_SEGMENT: r = make_text_segment(np, env); if (r < 0) return r; break; 
default: return ONIGERR_PARSER_BUG; break; } { tp = np; re_entry: r = fetch_token(tok, src, end, env); if (r < 0) return r; repeat: if (r == TK_REPEAT || r == TK_INTERVAL) { Node* target; if (is_invalid_quantifier_target(*tp)) return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; INC_PARSE_DEPTH(parse_depth); qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, r == TK_INTERVAL); CHECK_NULL_RETURN_MEMERR(qn); QUANT_(qn)->greedy = tok->u.repeat.greedy; if (group == 2) { target = node_drop_group(*tp); *tp = NULL_NODE; } else { target = *tp; } r = assign_quantifier_body(qn, target, group, env); if (r < 0) { onig_node_free(qn); *tp = NULL_NODE; return r; } if (tok->u.repeat.possessive != 0) { Node* en; en = node_new_bag(BAG_STOP_BACKTRACK); if (IS_NULL(en)) { onig_node_free(qn); return ONIGERR_MEMORY; } NODE_BODY(en) = qn; qn = en; } if (r == 0) { *tp = qn; } else if (r == 1) { /* x{1,1} ==> x */ onig_node_free(qn); *tp = target; } else if (r == 2) { /* split case: /abc+/ */ Node *tmp; *tp = node_new_list(*tp, NULL); if (IS_NULL(*tp)) { onig_node_free(qn); return ONIGERR_MEMORY; } tmp = NODE_CDR(*tp) = node_new_list(qn, NULL); if (IS_NULL(tmp)) { onig_node_free(qn); return ONIGERR_MEMORY; } tp = &(NODE_CAR(tmp)); } group = 0; goto re_entry; } } return r; } static int prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, ParseEnv* env, int group_head) { int r; Node *node, **headp; *top = NULL; INC_PARSE_DEPTH(env->parse_depth); r = prs_exp(&node, tok, term, src, end, env, group_head); if (r < 0) { onig_node_free(node); return r; } if (r == TK_EOT || r == term || r == TK_ALT) { *top = node; } else { *top = node_new_list(node, NULL); if (IS_NULL(*top)) { mem_err: onig_node_free(node); return ONIGERR_MEMORY; } headp = &(NODE_CDR(*top)); while (r != TK_EOT && r != term && r != TK_ALT) { r = prs_exp(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; } if (NODE_TYPE(node) == NODE_LIST) { *headp = node; while 
(IS_NOT_NULL(NODE_CDR(node))) node = NODE_CDR(node); headp = &(NODE_CDR(node)); } else { *headp = node_new_list(node, NULL); if (IS_NULL(*headp)) goto mem_err; headp = &(NODE_CDR(*headp)); } } } DEC_PARSE_DEPTH(env->parse_depth); return r; } /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ static int prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, ParseEnv* env, int group_head) { int r; Node *node, **headp; OnigOptionType save_options; *top = NULL; INC_PARSE_DEPTH(env->parse_depth); save_options = env->options; r = prs_branch(&node, tok, term, src, end, env, group_head); if (r < 0) { onig_node_free(node); return r; } if (r == term) { *top = node; } else if (r == TK_ALT) { *top = onig_node_new_alt(node, NULL); if (IS_NULL(*top)) { onig_node_free(node); return ONIGERR_MEMORY; } headp = &(NODE_CDR(*top)); while (r == TK_ALT) { r = fetch_token(tok, src, end, env); if (r < 0) return r; r = prs_branch(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; } *headp = onig_node_new_alt(node, NULL); if (IS_NULL(*headp)) { onig_node_free(node); onig_node_free(*top); return ONIGERR_MEMORY; } headp = &(NODE_CDR(*headp)); } if (tok->type != (enum TokenSyms )term) goto err; } else { onig_node_free(node); err: if (term == TK_SUBEXP_CLOSE) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; else return ONIGERR_PARSER_BUG; } env->options = save_options; DEC_PARSE_DEPTH(env->parse_depth); return r; } static int prs_regexp(Node** top, UChar** src, UChar* end, ParseEnv* env) { int r; PToken tok; ptoken_init(&tok); r = fetch_token(&tok, src, end, env); if (r < 0) return r; r = prs_alts(top, &tok, TK_EOT, src, end, env, FALSE); if (r < 0) return r; return 0; } #ifdef USE_CALL static int make_call_zero_body(Node* node, ParseEnv* env, Node** rnode) { int r; Node* x = node_new_memory(0 /* 0: is not named */); CHECK_NULL_RETURN_MEMERR(x); NODE_BODY(x) = node; BAG_(x)->m.regnum = 0; r = scan_env_set_mem_node(env, 0, x); if (r != 0) { 
onig_node_free(x); return r; } *rnode = x; return 0; } #endif extern int onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ParseEnv* env) { int r; UChar* p; #ifdef USE_CALLOUT RegexExt* ext; #endif reg->string_pool = 0; reg->string_pool_end = 0; reg->num_mem = 0; reg->num_repeat = 0; reg->num_empty_check = 0; reg->repeat_range_alloc = 0; reg->repeat_range = (RepeatRange* )NULL; names_clear(reg); scan_env_clear(env); env->options = reg->options; env->case_fold_flag = reg->case_fold_flag; env->enc = reg->enc; env->syntax = reg->syntax; env->pattern = (UChar* )pattern; env->pattern_end = (UChar* )end; env->reg = reg; *root = NULL; if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end)) return ONIGERR_INVALID_WIDE_CHAR_VALUE; p = (UChar* )pattern; r = prs_regexp(root, &p, (UChar* )end, env); if (r != 0) return r; #ifdef USE_CALL if ((env->flags & PE_FLAG_HAS_CALL_ZERO) != 0) { Node* zero_node; r = make_call_zero_body(*root, env, &zero_node); if (r != 0) return r; *root = zero_node; } #endif reg->num_mem = env->num_mem; #ifdef USE_CALLOUT ext = reg->extp; if (IS_NOT_NULL(ext) && ext->callout_num > 0) { r = setup_ext_callout_list_values(reg); } #endif return r; } extern void onig_scan_env_set_error_string(ParseEnv* env, int ecode ARG_UNUSED, UChar* arg, UChar* arg_end) { env->error = arg; env->error_end = arg_end; }