From e25c754918ae26e8b9e68a47bc1af36248e91800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 12 Jul 2019 09:18:14 +0200 Subject: New upstream version 6.9.2 --- doc/API | 4 +- doc/API.ja | 8 +- doc/RE | 123 +++-- doc/RE.ja | 94 ++-- doc/SYNTAX.md | 1069 ++++++++++++++++++++++++++++++++++++++ doc/UNICODE_PROPERTIES | 1345 ++++++++++++++++++++++++------------------------ 6 files changed, 1876 insertions(+), 767 deletions(-) create mode 100644 doc/SYNTAX.md (limited to 'doc') diff --git a/doc/API b/doc/API index 24b531a..2309e5e 100644 --- a/doc/API +++ b/doc/API @@ -1,4 +1,4 @@ -Oniguruma API Version 6.8.0 2018/03/13 +Oniguruma API Version 6.9.2 2019/03/25 #include @@ -92,6 +92,8 @@ Oniguruma API Version 6.8.0 2018/03/13 (alnum, alpha, blank, cntrl, digit, graph, lower, print, punct, space, upper, xdigit, word) + ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER Extended Grapheme Cluster mode + ONIG_OPTION_TEXT_SEGMENT_WORD Word mode 5 enc: character encoding. 
diff --git a/doc/API.ja b/doc/API.ja index 5226288..164d0b8 100644 --- a/doc/API.ja +++ b/doc/API.ja @@ -1,4 +1,4 @@ -鬼車インターフェース Version 6.8.0 2018/03/13 +鬼車インターフェース Version 6.9.2 2019/03/29 #include @@ -91,6 +91,8 @@ (alnum, alpha, blank, cntrl, digit, graph, lower, print, punct, space, upper, xdigit, word) + ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER 拡張書記素房モード + ONIG_OPTION_TEXT_SEGMENT_WORD 単語モード 5 enc: 文字エンコーディング @@ -325,8 +327,8 @@ 8 mp: マッチパラメタ値 (match_stack_limit, retry_limit_in_match) -# int onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, - OnigRegion* region, OnigOptionType option) +# int onig_match(regex_t* reg, const UChar* str, const UChar* end, + const UChar* at, OnigRegion* region, OnigOptionType option) 文字列の指定位置でマッチングを行い、結果とマッチ領域を返す。 diff --git a/doc/RE b/doc/RE index 963d009..72957dd 100644 --- a/doc/RE +++ b/doc/RE @@ -1,4 +1,4 @@ -Oniguruma Regular Expressions Version 6.8.0 2018/07/26 +Oniguruma Regular Expressions Version 6.9.2 2019/03/29 syntax: ONIG_SYNTAX_ONIGURUMA (default) @@ -81,15 +81,23 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) \O true anychar (?m:.) (* original function) - \X Extended Grapheme Cluster (?>\O(?:\Y\O)*) + \X Text Segment \X === (?>\O(?:\Y\O)*) - \X doesn't check whether matching start position is boundary. - Write as \y\X if you want to ensure it. + The meaning of this operator changes depending on the setting of + the option (?y{..}). - Unicode case: - See [Unicode Standard Annex #29: http://unicode.org/reports/tr29/] + \X doesn't check whether matching start position is boundary or not. + Please write as \y\X if you want to ensure it. - Not Unicode: (?>\r\n|\O) + [Extended Grapheme Cluster mode] (default) + Unicode case: + See [Unicode Standard Annex #29: http://unicode.org/reports/tr29/] + + Not Unicode case: \X === (?>\r\n|\O) + + [Word mode] + Currently, this mode is supported in Unicode only. 
+ See [Unicode Standard Annex #29: http://unicode.org/reports/tr29/] Character Property @@ -119,17 +127,17 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) ? 1 or 0 times * 0 or more times + 1 or more times - {n,m} at least n but no more than m times + {n,m} (n <= m) at least n but no more than m times {n,} at least n times {,n} at least 0 but no more than n times ({0,n}) {n} n times reluctant - ?? 1 or 0 times + ?? 0 or 1 times *? 0 or more times +? 1 or more times - {n,m}? at least n but not more than m times + {n,m}? (n <= m) at least n but not more than m times {n,}? at least n times {,n}? at least 0 but not more than n times (== {0,n}?) @@ -138,8 +146,10 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) ?+ 1 or 0 times *+ 0 or more times ++ 1 or more times + {n,m} (n > m) at least m but not more than n times - ({n,m}+, {n,}+, {n}+ are possessive op. in ONIG_SYNTAX_JAVA only) + {n,m}+, {n,}+, {n}+ are possessive operators in ONIG_SYNTAX_JAVA and + ONIG_SYNTAX_PERL only. ex. /a*+/ === /(?>a*)/ @@ -150,8 +160,6 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) $ end of the line \b word boundary \B non-word boundary - \y Extended Grapheme Cluster boundary - \Y Extended Grapheme Cluster non-boundary \A beginning of string \Z end of string, or before newline at the end @@ -160,6 +168,24 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) \K keep (keep start position of the result string) + \y Text Segment boundary + \Y Text Segment non-boundary + + The meaning of these operators(\y, \Y) changes depending on the setting + of the option (?y{..}). + + [Extended Grapheme Cluster mode] (default) + Unicode case: + See [Unicode Standard Annex #29: http://unicode.org/reports/tr29/] + + Not Unicode: + All positions except between \r and \n. + + [Word mode] + Currently, this mode is supported in Unicode only. + See [Unicode Standard Annex #29: http://unicode.org/reports/tr29/] + + 6. Character class @@ -221,20 +247,28 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) (?#...) 
comment - (?imxWDSP-imxWDSP:subexp) option on/off for subexp + (?imxWDSPy-imxWDSP:subexp) option on/off for subexp + + i: ignore case + m: multi-line (dot (.) also matches newline) + x: extended form + W: ASCII only word (\w, \p{Word}, [[:word:]]) + ASCII only word bound (\b) + D: ASCII only digit (\d, \p{Digit}, [[:digit:]]) + S: ASCII only space (\s, \p{Space}, [[:space:]]) + P: ASCII only POSIX properties (includes W,D,S) + (alnum, alpha, blank, cntrl, digit, graph, + lower, print, punct, space, upper, xdigit, word) + + y{?}: Text Segment mode + This option changes the meaning of \X, \y, \Y. + Currently, this option is supported in Unicode only. - i: ignore case - m: multi-line (dot (.) also matches newline) - x: extended form - W: ASCII only word (\w, \p{Word}, [[:word:]]) - ASCII only word bound (\b) - D: ASCII only digit (\d, \p{Digit}, [[:digit:]]) - S: ASCII only space (\s, \p{Space}, [[:space:]]) - P: ASCII only POSIX properties (includes W,D,S) - (alnum, alpha, blank, cntrl, digit, graph, - lower, print, punct, space, upper, xdigit, word) + y{g}: Extended Grapheme Cluster mode (default) + y{w}: Word mode + See [Unicode Standard Annex #29] - (?imxWDSP-imxWDSP) isolated option + (?imxWDSPy-imxWDSP) isolated option * It makes a group to the next ')' or end of the pattern. /ab(?i)c|def|gh/ == /ab(?i:c|def|gh)/ @@ -336,7 +370,7 @@ syntax: ONIG_SYNTAX_ONIGURUMA (default) else_exp can be omitted. Then it works as a backreference validity checker. - [ backreference validity checker ] (* original) + [ Backreference validity checker ] (* original) (?(n)), (?(-n)), (?(+n)), (?(n+level)) ... (?()), (?('-n')), (?(<+n>)) ... @@ -470,10 +504,15 @@ A-1. Syntax-dependent options A-2. Original extensions - + hexadecimal digit char type \h, \H - + named group (?...), (?'name'...) 
- + named backref \k - + subexp call \g, \g + + hexadecimal digit char type \h, \H + + true anychar \O + + text segment boundary \y, \Y + + backreference validity checker (?(...)) + + named group (?...), (?'name'...) + + named backref \k + + subexp call \g, \g + + absent expression (?~|...|...) + + absent stopper (?|...) A-3. Missing features compared with perl 5.8.0 @@ -528,28 +567,4 @@ A-4. Differences with Japanized GNU regex(version 0.12) of Ruby 1.8 /(?:()|())*\1\2/ =~ "" /(?:\1a|())*/ =~ "a" - -A-5. Features disabled in default syntax - - + capture history - - (?@...) and (?@...) - - ex. /(?@a)*/.match("aaa") ==> [<0-1>, <1-2>, <2-3>] - - see sample/listcap.c file. - - -A-6. Problems - - + Invalid encoding byte sequence is not checked. - - ex. UTF-8 - - * Invalid first byte is treated as a character. - /./u =~ "\xa3" - - * Incomplete byte sequence is not checked. - /\w+/ =~ "a\xf3\x8ec" - // END diff --git a/doc/RE.ja b/doc/RE.ja index b35a51e..c09e237 100644 --- a/doc/RE.ja +++ b/doc/RE.ja @@ -1,4 +1,4 @@ -鬼車 正規表現 Version 6.8.0 2018/07/26 +鬼車 正規表現 Version 6.9.2 2019/03/29 使用文法: ONIG_SYNTAX_ONIGURUMA (既定値) @@ -81,15 +81,22 @@ \O 真任意文字 (?m:.) (* 原作) - \X 拡張書記素房 (?>\O(?:\Y\O)*) + \X 文章区分 \X === (?>\O(?:\Y\O)*) - \Xは照合の開始位置が拡張書記素房の境界かどうかを確認しない。 + この演算子の意味は、オプション (?y{..})の設定によって変化する。 + + \Xは照合の開始位置が区分の境界かどうかを確認しない。 それを確実にしたければ、\y\Xと書けば良い。 - Unicodeの場合: - 参照 [Unicode Standard Annex #29: http://unicode.org/reports/tr29/] + [拡張書記素房-状態のとき] (デフォルト) + Unicodeの場合: + 参照 [Unicode Standard Annex #29: http://unicode.org/reports/tr29/] + + Unicode以外の場合: \X === (?>\r\n|\O) - Unicode以外の場合: (?>\r\n|\O) + [単語-状態のとき] + 現在、Unicodeしかサポートしていない。 + 参照 [Unicode Standard Annex #29: http://unicode.org/reports/tr29/] Character Property @@ -119,17 +126,17 @@ ? 一回または零回 * 零回以上 + 一回以上 - {n,m} n回以上m回以下 + {n,m} (n <= m) n回以上 かつ m回以下 {n,} n回以上 {,n} 零回以上n回以下 ({0,n}) {n} n回 無欲 - ?? 一回または零回 + ?? 零回または一回 *? 零回以上 +? 一回以上 - {n,m}? n回以上m回以下 + {n,m}? (n <= m) n回以上 かつ m回以下 {n,}? n回以上 {,n}? 
零回以上n回以下 (== {0,n}?) @@ -138,8 +145,9 @@ ?+ 一回または零回 *+ 零回以上 ++ 一回以上 + {n,m} (n > m) m回以上 かつ n回以下 - ({n,m}+, {n,}+, {n}+ は、ONIG_SYNTAX_JAVAでのみ強欲な指定子) + {n,m}+, {n,}+, {n}+ は、ONIG_SYNTAX_JAVAとONIG_SYNTAX_PERLでのみ強欲な指定子 例. /a*+/ === /(?>a*)/ @@ -150,8 +158,6 @@ $ 行末 \b 単語境界 \B 非単語境界 - \y 拡張書記素房 境界 - \Y 拡張書記素房 非境界 \A 文字列先頭 \Z 文字列末尾、または文字列末尾の改行の直前 @@ -160,6 +166,23 @@ \K 保持 (結果の開始位置をこの位置に保つ) + \y 文章区分 境界 + \Y 文章区分 非境界 + + この演算子の意味は、オプション (?y{..})の設定によって変化する。 + + [拡張書記素房-状態のとき] (デフォルト) + Unicodeの場合: + 参照 [Unicode Standard Annex #29: http://unicode.org/reports/tr29/] + + Unicode以外の場合: + \rと\nの間を除く全ての位置 + + [単語-状態のとき] + 現在、Unicodeしかサポートしていない。 + 参照 [Unicode Standard Annex #29: http://unicode.org/reports/tr29/] + + 6. 文字集合 @@ -220,7 +243,7 @@ (?#...) 注釈 - (?imxWDSP-imxWDSP:式) 式オプション + (?imxWDSPy-imxWDSP:式) 式オプション i: 大文字小文字照合 m: 複数行 @@ -233,7 +256,14 @@ (alnum, alpha, blank, cntrl, digit, graph, lower, print, punct, space, upper, xdigit, word) - (?imxWDSP-imxWDSP) 孤立オプション + y{?}: 文章区分状態 + このオプションは\X, \y, \Yの意味を変更する。 + 現在このオプションはUnicodeでしかサポートしていない + y{g}: 拡張書記素房-状態 (デフォルト) + y{w}: 単語-状態 + 参照 [Unicode Standard Annex #29] + + (?imxWDSPy-imxWDSP) 孤立オプション * これは次の')'またはパターンの終わりまでのグループを形成する /ab(?i)c|def|gh/ == /ab(?i:c|def|gh)/ @@ -482,9 +512,14 @@ 補記 2. 独自拡張機能 + 16進数数字、非16進数字 \h, \H + + 真任意文字 \O + + 文章区分境界 \y, \Y + + 後方参照値有効性確認器 (?(...)) + 名前付き捕獲式集合 (?...), (?'name'...) + 名前指定後方参照 \k + 部分式呼出し \g, \g + + 不在式 (?~|...|...) + + 不在停止 (?|...) 補記 3. Perl 5.8.0と比較して存在しない機能 @@ -539,35 +574,4 @@ /(?:()|())*\1\2/ =~ "" /(?:\1a|())*/ =~ "a" - - -補記 5. 実装されているが、既定値では有効にしていない機能 - - + 捕獲履歴参照 - - (?@...) と (?@...) - - 例. /(?@a)*/.match("aaa") ==> [<0-1>, <1-2>, <2-3>] - - 使用方法は、sample/listcap.cを参照 - - 有効にしていない理由は、どの程度役に立つかはっきりしないため。 - - -補記 6. 
問題点 - - + エンコーディングバイト値が適正な価かどうかのチェックは行なっていない。 - - 例: UTF-8 - - * 先頭バイトとして不正なバイトを一文字とみなす - /./u =~ "\xa3" - - * 不完全なバイトシーケンスのチェックをしない - /\w+/u =~ "a\xf3\x8ec" - - これを調べることは可能ではあるが、遅くなるので行なわない。 - - 文字列として、そのようなバイト列を指定した場合の動作は保証しない。 - 終り diff --git a/doc/SYNTAX.md b/doc/SYNTAX.md new file mode 100644 index 0000000..449f262 --- /dev/null +++ b/doc/SYNTAX.md @@ -0,0 +1,1069 @@ + +# Oniguruma syntax (operator) configuration + +_Documented for Oniguruma 6.9.2 (2019/03/28)_ + + +---------- + + +## Overview + +This document details how to configure Oniguruma's syntax, by describing the desired +syntax operators and behaviors in an instance of the OnigSyntaxType struct, just like +the built-in Oniguruma syntaxes do. + +Configuration operators are bit flags, and are broken into multiple groups, somewhat arbitrarily, +because Oniguruma takes its configuration as a trio of 32-bit `unsigned int` values, assigned as +the first three fields in an `OnigSyntaxType` struct: + +```C +typedef struct { + unsigned int op; + unsigned int op2; + unsigned int behavior; + OnigOptionType options; /* default option */ + OnigMetaCharTableType meta_char_table; +} OnigSyntaxType; +``` + +The first group of configuration flags (`op`) roughly corresponds to the +configuration for "basic regex." The second group (`op2`) roughly corresponds +to the configuration for "advanced regex." And the third group (`behavior`) +describes more-or-less what to do for broken input, bad input, or other corner-case +regular expressions whose meaning is not well-defined. These three groups of +flags are described in full below, and tables of their usages for various syntaxes +follow. + +The `options` field describes the default compile options to use if the caller does +not specify any options when invoking `onig_new()`. 
+ +The `meta_char_table` field is used exclusively by the ONIG_SYN_OP_VARIABLE_META_CHARACTERS +option, which allows the various regex metacharacters, like `*` and `?`, to be replaced +with alternates (for example, SQL typically uses `%` instead of `.*` and `_` instead of `.`). + + +---------- + + +## Group One Flags (op) + + +This group contains "basic regex" constructs, features common to most regex systems. + + +### 0. ONIG_SYN_OP_VARIABLE_META_CHARACTERS + +_Set in: none_ + +Enables support for `onig_set_meta_char()`, which allows you to provide alternate +characters that will be used instead of the six special characters that are normally +these characters below: + + - `ONIG_META_CHAR_ESCAPE`: `\` + - `ONIG_META_CHAR_ANYCHAR`: `.` + - `ONIG_META_CHAR_ANYTIME`: `*` + - `ONIG_META_CHAR_ZERO_OR_ONE_TIME`: `?` + - `ONIG_META_CHAR_ONE_OR_MORE_TIME`: `+` + - `ONIG_META_CHAR_ANYCHAR_ANYTIME`: Equivalent in normal regex to `.*`, but supported + explicitly so that Oniguruma can support matching SQL `%` wildcards or shell `*` wildcards. + +If this flag is set, then the values defined using `onig_set_meta_char()` will be used; +if this flag is clear, then the default regex characters will be used instead, and +data set by `onig_set_meta_char()` will be ignored. + + +### 1. ONIG_SYN_OP_DOT_ANYCHAR (enable `.`) + +_Set in: PosixBasic, PosixExtended, Emacs, Grep, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for the standard `.` metacharacter, meaning "any one character." You +usually want this flag on unless you have turned on `ONIG_SYN_OP_VARIABLE_META_CHARACTERS` +so that you can use a metacharacter other than `.` instead. + + +### 2. ONIG_SYN_OP_ASTERISK_ZERO_INF (enable `r*`) + +_Set in: PosixBasic, PosixExtended, Emacs, Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the standard `r*` metacharacter, meaning "zero or more r's." 
+You usually want this flag set unless you have turned on `ONIG_SYN_OP_VARIABLE_META_CHARACTERS` +so that you can use a metacharacter other than `*` instead. + + +### 3. ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF (enable `r\*`) + +_Set in: none_ + +Enables support for an escaped `r\*` metacharacter, meaning "zero or more r's." This is +useful if you have disabled support for the normal `r*` metacharacter because you want `*` +to simply match a literal `*` character, but you still want some way of activating "zero or more" +behavior. + + +### 4. ONIG_SYN_OP_PLUS_ONE_INF (enable `r+`) + +_Set in: PosixExtended, Emacs, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the standard `r+` metacharacter, meaning "one or more r's." +You usually want this flag set unless you have turned on `ONIG_SYN_OP_VARIABLE_META_CHARACTERS` +so that you can use a metacharacter other than `+` instead. + + +### 5. ONIG_SYN_OP_ESC_PLUS_ONE_INF (enable `r\+`) + +_Set in: Grep_ + +Enables support for an escaped `r\+` metacharacter, meaning "one or more r's." This is +useful if you have disabled support for the normal `r+` metacharacter because you want `+` +to simply match a literal `+` character, but you still want some way of activating "one or more" +behavior. + + +### 6. ONIG_SYN_OP_QMARK_ZERO_ONE (enable `r?`) + +_Set in: PosixExtended, Emacs, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the standard `r?` metacharacter, meaning "zero or one r" or "an optional r." +You usually want this flag set unless you have turned on `ONIG_SYN_OP_VARIABLE_META_CHARACTERS` +so that you can use a metacharacter other than `?` instead. + + +### 7. ONIG_SYN_OP_ESC_QMARK_ZERO_ONE (enable `r\?`) + +_Set in: Grep_ + +Enables support for an escaped `r\?` metacharacter, meaning "zero or one r" or "an optional +r." 
This is useful if you have disabled support for the normal `r?` metacharacter because +you want `?` to simply match a literal `?` character, but you still want some way of activating +"optional" behavior. + + +### 8. ONIG_SYN_OP_BRACE_INTERVAL (enable `r{l,u}`) + +_Set in: PosixExtended, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the `r{lower,upper}` range form, common to more advanced +regex engines, which lets you specify precisely a minimum and maximum range on how many r's +must match (and not simply "zero or more"). + +This form also allows `r{count}` to specify a precise count of r's that must match. + +This form also allows `r{lower,}` to be equivalent to `r{lower,infinity}`. + +If and only if the `ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV` behavior flag is set, +this form also allows `r{,upper}` to be equivalent to `r{0,upper}`; otherwise, +`r{,upper}` will be treated as an error. + + +### 9. ONIG_SYN_OP_ESC_BRACE_INTERVAL (enable `\{` and `\}`) + +_Set in: PosixBasic, Emacs, Grep_ + +Enables support for an escaped `r\{lower,upper\}` range form. This is useful if you +have disabled support for the normal `r{...}` range form and want curly braces to simply +match literal curly brace characters, but you still want some way of activating +"range" behavior. + + +### 10. ONIG_SYN_OP_VBAR_ALT (enable `r|s`) + +_Set in: PosixExtended, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `r|s` alternation operator. You usually want this +flag set. + + +### 11. ONIG_SYN_OP_ESC_VBAR_ALT (enable `\|`) + +_Set in: Emacs, Grep_ + +Enables support for an escaped `r\|s` alternation form. This is useful if you +have disabled support for the normal `r|s` alternation form and want `|` to simply +match a literal `|` character, but you still want some way of activating "alternate" behavior. + + +### 12. 
ONIG_SYN_OP_LPAREN_SUBEXP (enable `(r)`) + +_Set in: PosixExtended, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `(...)` grouping-and-capturing operators. You usually +want this flag set. + + +### 13. ONIG_SYN_OP_ESC_LPAREN_SUBEXP (enable `\(` and `\)`) + +_Set in: PosixBasic, Emacs, Grep_ + +Enables support for escaped `\(...\)` grouping-and-capturing operators. This is useful if you +have disabled support for the normal `(...)` grouping-and-capturing operators and want +parentheses to simply match literal parenthesis characters, but you still want some way of +activating "grouping" or "capturing" behavior. + + +### 14. ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR (enable `\A` and `\Z` and `\z`) + +_Set in: GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the anchors `\A` (start-of-string), `\Z` (end-of-string or +newline-at-end-of-string), and `\z` (end-of-string) escapes. + +(If the escape metacharacter has been changed from the default of `\`, this +option will recognize that metacharacter instead.) + + +### 15. ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR (enable `\G`) + +_Set in: GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the special anchor `\G` (start-of-previous-match). + +(If the escape metacharacter has been changed from the default of `\`, this +option will recognize that metacharacter instead.) + +Note that `OnigRegex`/`regex_t` are not stateful objects, and do _not_ record +the location of the previous match. The `\G` flag uses the `start` parameter +explicitly passed to `onig_search()` (or `onig_search_with_param()`) to determine +the "start of the previous match," so if the caller always passes the start of +the entire buffer as the function's `start` parameter, then `\G` will behave +exactly the same as `\A`. + + +### 16. 
ONIG_SYN_OP_DECIMAL_BACKREF (enable `\num`) + +_Set in: PosixBasic, PosixExtended, Emacs, Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for subsequent matches to back references to prior capture groups `(...)` using +the common `\num` syntax (like `\3`). + +If this flag is clear, then a numeric escape like `\3` will either be treated as a literal `3`, +or, if `ONIG_SYN_OP_ESC_OCTAL3` is set, will be treated as an octal character code `\3`. + +You usually want this enabled, and it is enabled by default in every built-in syntax. + + +### 17. ONIG_SYN_OP_BRACKET_CC (enable `[...]`) + +_Set in: PosixBasic, PosixExtended, Emacs, Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for recognizing character classes, like `[a-z]`. If this flag is not set, `[` +and `]` will be treated as ordinary literal characters instead of as metacharacters. + +You usually want this enabled, and it is enabled by default in every built-in syntax. + + +### 18. ONIG_SYN_OP_ESC_W_WORD (enable `\w` and `\W`) + +_Set in: Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `\w` and `\W` shorthand forms. These match "word characters," +whose meaning varies depending on the encoding being used. + +In ASCII encoding, `\w` is equivalent to `[A-Za-z0-9_]`. + +In most other encodings, `\w` matches many more characters, including accented letters, Greek letters, +Cyrillic letters, Braille letters and numbers, Runic letters, Hebrew letters, Arabic letters and numerals, +Chinese Han ideographs, Japanese Katakana and Hiragana, Korean Hangul, and generally any symbol that +could qualify as a phonetic "letter" or counting "number" in any language. (Note that emoji are _not_ +considered "word characters.") + +`\W` always matches the opposite of whatever `\w` matches. + + +### 19. 
ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END (enable `\<` and `\>`) + +_Set in: Grep, GnuRegex_ + +Enables support for the GNU-specific `\<` and `\>` word-boundary metacharacters. These work like +the `\b` word-boundary metacharacter, but only match at one end of the word or the other: `\<` +only matches at a transition from a non-word character to a word character (i.e., at the start +of a word), and `\>` only matches at a transition from a word character to a non-word character +(i.e., at the end of a word). + +Most regex syntaxes do _not_ support these metacharacters. + + +### 20. ONIG_SYN_OP_ESC_B_WORD_BOUND (enable `\b` and `\B`) + +_Set in: Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `\b` and `\B` word-boundary metacharacters. The `\b` metacharacter +matches a zero-width position at a transition from word-characters to non-word-characters, or vice +versa. The `\B` metacharacter matches at all positions _not_ matched by `\b`. + +See details in `ONIG_SYN_OP_ESC_W_WORD` above for an explanation as to which characters +are considered "word characters." + + +### 21. ONIG_SYN_OP_ESC_S_WHITE_SPACE (enable `\s` and `\S`) + +_Set in: GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `\s` and `\S` whitespace-matching metacharacters. + +The `\s` metacharacter in ASCII encoding is exactly equivalent to the character class +`[\t\n\v\f\r ]`, or character codes 9 through 13 (inclusive), and 32. 
+ +The `\s` metacharacter in Unicode is exactly equivalent to the character class +`[\t\n\v\f\r \x85\xA0\x1680\x2000-\x200A\x2028-\x2029\x202F\x205F\x3000]` — that is, it matches +the same as ASCII, plus U+0085 (next line), U+00A0 (nonbreaking space), U+1680 (Ogham space mark), +U+2000 (en quad) through U+200A (hair space) (this range includes several widths of Unicode spaces), +U+2028 (line separator) through U+2029 (paragraph separator), +U+202F (narrow no-break space), U+205F (medium mathematical space), and U+3000 (CJK ideographic space). + +All non-Unicode encodings are handled by converting their code points to the appropriate +Unicode-equivalent code points, and then matching according to Unicode rules. + +`\S` always matches any one character that is _not_ in the set matched by `\s`. + + +### 22. ONIG_SYN_OP_ESC_D_DIGIT (enable `\d` and `\D`) + +_Set in: GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `\d` and `\D` digit-matching metacharacters. + +The `\d` metacharacter in ASCII encoding is exactly equivalent to the character class +`[0-9]`, or character codes 48 through 57 (inclusive). + +The `\d` metacharacter in Unicode matches `[0-9]`, as well as digits in Arabic, Devanagari, +Bengali, Laotian, Mongolian, CJK fullwidth numerals, and many more. + +All non-Unicode encodings are handled by converting their code points to the appropriate +Unicode-equivalent code points, and then matching according to Unicode rules. + +`\D` always matches any one character that is _not_ in the set matched by `\d`. + + +### 23. ONIG_SYN_OP_LINE_ANCHOR (enable `^r` and `r$`) + +_Set in: Emacs, Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the common `^` and `$` line-anchor metacharacters. + +In single-line mode, `^` matches the start of the input buffer, and `$` matches +the end of the input buffer. 
In multi-line mode, `^` matches if the preceding +character is `\n`; and `$` matches if the following character is `\n`. + +(Note that Oniguruma does not recognize other newline types: It only matches +`^` and `$` against `\n`: not `\r`, not `\r\n`, not the U+2028 line separator, +and not any other form.) + + +### 24. ONIG_SYN_OP_POSIX_BRACKET (enable POSIX `[:xxxx:]`) + +_Set in: PosixBasic, PosixExtended, Grep, GnuRegex, Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for the POSIX `[:xxxx:]` character classes, like `[:alpha:]` and `[:digit:]`. +The supported POSIX character classes are `alnum`, `alpha`, `blank`, `cntrl`, `digit`, +`graph`, `lower`, `print`, `punct`, `space`, `upper`, `xdigit`, `ascii`, `word`. + + +### 25. ONIG_SYN_OP_QMARK_NON_GREEDY (enable `r??`, `r*?`, `r+?`, and `r{n,m}?`) + +_Set in: Perl, Java, Perl_NG, Ruby, Oniguruma_ + +Enables support for lazy (non-greedy) quantifiers: That is, if you append a `?` after +another quantifier such as `?`, `*`, `+`, or `{n,m}`, Oniguruma will try to match +as _little_ as possible instead of as _much_ as possible. + + +### 26. ONIG_SYN_OP_ESC_CONTROL_CHARS (enable `\n`, `\r`, `\t`, etc.) + +_Set in: PosixBasic, PosixExtended, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for C-style control-code escapes, like `\n` and `\r`. Specifically, +this recognizes `\a` (7), `\b` (8), `\t` (9), `\n` (10), `\f` (12), `\r` (13), and +`\e` (27). If ONIG_SYN_OP2_ESC_V_VTAB is enabled (see below), this also enables +support for recognizing `\v` as code point 11. + + +### 27. ONIG_SYN_OP_ESC_C_CONTROL (enable `\cx` control codes) + +_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for named control-code escapes, like `\cm` or `\cM` for code-point +13. In this shorthand form, control codes may be specified by `\c` (for "Control") +followed by an alphabetic letter, a-z or A-Z, indicating which code point to represent +(1 through 26). So `\cA` is code point 1, and `\cZ` is code point 26. 
+ + +### 28. ONIG_SYN_OP_ESC_OCTAL3 (enable `\OOO` octal codes) + +_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for octal-style escapes of up to three digits, like `\1` for code +point 1, and `\177` for code point 127. Octal values greater than 255 will result +in an error message. + + +### 29. ONIG_SYN_OP_ESC_X_HEX2 (enable `\xHH` hex codes) + +_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for hexadecimal-style escapes of up to two digits, like `\x1` for code +point 1, and `\x7F` for code point 127. + + +### 30. ONIG_SYN_OP_ESC_X_BRACE_HEX8 (enable `\x{7HHHHHHH}` hex codes) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for brace-wrapped hexadecimal-style escapes of up to eight digits, +like `\x{1}` for code point 1, and `\x{FFFE}` for code point 65534. + + +### 31. ONIG_SYN_OP_ESC_O_BRACE_OCTAL (enable `\o{1OOOOOOOOOO}` octal codes) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for brace-wrapped octal-style escapes of up to eleven digits, +like `\o{1}` for code point 1, and `\o{177776}` for code point 65534. + +(New feature as of Oniguruma 6.3.) + + +---------- + + +## Group Two Flags (op2) + + +This group contains support for lesser-known regex syntax constructs. + + +### 0. ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE (enable `\Q...\E`) + +_Set in: Java, Perl, Perl_NG_ + +Enables support for "quoted" parts of a pattern: Between `\Q` and `\E`, all +syntax parsing is turned off, so that metacharacters like `*` and `+` will no +longer be treated as metacharacters, and instead will be matched as literal +`*` and `+`, as if they had been escaped with `\*` and `\+`. + + +### 1. ONIG_SYN_OP2_QMARK_GROUP_EFFECT (enable `(?...)`) + +_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for the fairly-common `(?...)` grouping operator, which +controls precedence but which does _not_ capture its contents. + + +### 2. 
ONIG_SYN_OP2_OPTION_PERL (enable options `(?imsx)` and `(?-imsx)`) + +_Set in: Java, Perl, Perl_NG_ + +Enables support of regex options. (i,m,s,x) +The supported toggle-able options for this flag are: + + - `i` - Case-insensitivity + - `m` - Multi-line mode (`^` and `$` match at `\n` as well as start/end of buffer) + - `s` - Single-line mode (`.` can match `\n`) + - `x` - Extended pattern (free-formatting: whitespace will be ignored) + + +### 3. ONIG_SYN_OP2_OPTION_RUBY (enable options `(?imx)` and `(?-imx)`) + +_Set in: Ruby, Oniguruma_ + +Enables support of regex options. (i,m,x) +The supported toggle-able options for this flag are: + + - `i` - Case-insensitivity + - `m` - Multi-line mode (`.` can match `\n`) + - `x` - Extended pattern (free-formatting: whitespace will be ignored) + + +### 4. ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT (enable `r?+`, `r*+`, and `r++`) + +_Set in: Ruby, Oniguruma_ + +Enables support for the _possessive_ quantifiers `?+`, `*+`, and `++`, which +work similarly to `?` and `*` and `+`, respectively, but which do not backtrack +after matching: Like the normal greedy quantifiers, they match as much as +possible, but they do not attempt to match _less_ than their maximum possible +extent if subsequent parts of the pattern fail to match. + + +### 5. ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL (enable `r{n,m}+`) + +_Set in: Java_ + +Enables support for the _possessive_ quantifier `{n,m}+`, which +works similarly to `{n,m}`, but which does not backtrack +after matching: Like the normal greedy quantifier, it matches as much as +possible, but it does not attempt to match _less_ than its maximum possible +extent if subsequent parts of the pattern fail to match. + + +### 6. ONIG_SYN_OP2_CCLASS_SET_OP (enable `&&` within `[...]`) + +_Set in: Java, Ruby, Oniguruma_ + +Enables support for character-class _intersection_. 
For example, with this +feature enabled, you can write `[a-z&&[^aeiou]]` to produce a character class +of only consonants, or `[\0-\37&&[^\n\r]]` to produce a character class of +all control codes _except_ newlines. + + +### 7. ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP (enable named captures `(?<name>...)`) + +_Set in: Perl_NG, Ruby, Oniguruma_ + +Enables support for _naming_ capture groups, so that instead of having to +refer to captures by position (like `\3` or `$3`), you can refer to them by names +(like `server` and `path`). This supports the Perl/Ruby naming syntaxes `(?<name>...)` +and `(?'name'...)`, but not the Python `(?P<name>...)` syntax. + + +### 8. ONIG_SYN_OP2_ESC_K_NAMED_BACKREF (enable named backreferences `\k<name>`) + +_Set in: Perl_NG, Ruby, Oniguruma_ + +Enables support for substituted backreferences by name, not just by position. +This supports using `\k'name'` in addition to supporting `\k<name>`. This also +supports an Oniguruma-specific extension that lets you specify the _distance_ of +the match, if the capture matched multiple times, by writing `\k<name+n>` or +`\k<name-n>`. + + +### 9. ONIG_SYN_OP2_ESC_G_SUBEXP_CALL (enable backreferences `\g<name>` and `\g<n>`) + +_Set in: Perl_NG, Ruby, Oniguruma_ + +Enables support for substituted backreferences by both name and position using +the same syntax. This supports using `\g'name'` and `\g'1'` in addition to +supporting `\g<name>` and `\g<1>`. + + +### 10. ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY (enable `(?@...)` and `(?@<name>...)`) + +_Set in: none_ + +Enables support for _capture history_, which can answer via the `onig_*capture*()` +functions exactly which captures were matched, how many times, and where in the +input they were matched, by placing `?@` in front of the capture. Per Oniguruma's +regex syntax documentation (appendix A-5): + +`/(?@a)*/.match("aaa")` ==> `[<0-1>, <1-2>, <2-3>]` + +This can require substantial memory, is primarily useful for debugging, and is not +enabled by default in any syntax. + + +### 11. 
ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (enable `\C-x`) + +_Set in: Ruby, Oniguruma_ + +Enables support for Ruby legacy control-code escapes, like `\C-m` or `\C-M` for code-point +13. In this shorthand form, control codes may be specified by `\C-` (for "Control") +followed by a single character (or equivalent), indicating which code point to represent, +based on that character's lowest five bits. So, like `\c`, you can represent code-point +10 with `\C-j`, but you can also represent it with `\C-*` as well. + +See also ONIG_SYN_OP_ESC_C_CONTROL, which enables the more-common `\cx` syntax. + + +### 12. ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META (enable `\M-x`) + +_Set in: Ruby, Oniguruma_ + +Enables support for Ruby legacy meta-code escapes. When you write `\M-x`, Oniguruma +will match an `x` whose 8th bit is set (i.e., the character code of `x` will be or'ed +with `0x80`). So, for example, you can match `\x81` using `\x81`, or you can write +`\M-\1`. This is mostly useful when working with legacy 8-bit character encodings. + + +### 13. ONIG_SYN_OP2_ESC_V_VTAB (enable `\v` as vertical tab) + +_Set in: Java, Ruby, Oniguruma_ + +Enables support for a C-style `\v` escape code, meaning "vertical tab." If enabled, +`\v` will be equivalent to ASCII code point 11. + + +### 14. ONIG_SYN_OP2_ESC_U_HEX4 (enable `\uHHHH` for Unicode) + +_Set in: Java, Ruby, Oniguruma_ + +Enables support for a Java-style `\uHHHH` escape code for representing Unicode +code-points by number, using up to four hexadecimal digits (up to `\uFFFF`). So, +for example, `\u221E` will match an infinity symbol, `∞`. + +For code points larger than four digits, like the emoji `🚡` (aerial tramway, or code +point U+1F6A1), you must either represent the character directly using an encoding like +UTF-8, or you must enable support for ONIG_SYN_OP_ESC_X_BRACE_HEX8 or +ONIG_SYN_OP_ESC_O_BRACE_OCTAL, which support more than four digits. + +(New feature as of Oniguruma 6.7.) + + +### 15. 
ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR (enable ``\` `` and `\'` anchors) + +_Set in: Emacs_ + +This flag makes the ``\` `` and `\'` escapes function identically to +`\A` and `\z`, respectively (when ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR is enabled). + +These anchor forms are very obscure, and rarely supported by other regex libraries. + + +### 16. ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (enable `\p{...}` and `\P{...}`) + +_Set in: Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for an alternate syntax for POSIX character classes; instead of +writing `[:alpha:]` when this is enabled, you can instead write `\p{alpha}`. + +See also ONIG_SYN_OP_POSIX_BRACKET for the classic POSIX form. + + +### 17. ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (enable `\p{^...}` and `\P{^...}`) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for an alternate syntax for POSIX character classes; instead of +writing `[:^alpha:]` when this is enabled, you can instead write `\p{^alpha}`. + +See also ONIG_SYN_OP_POSIX_BRACKET for the classic POSIX form. + + +### 18. ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS + +_(not presently used)_ + + +### 19. ONIG_SYN_OP2_ESC_H_XDIGIT (enable `\h` and `\H`) + +_Set in: Ruby, Oniguruma_ + +Enables support for the Ruby-specific shorthand `\h` and `\H` metacharacters. +Somewhat like `\d` matches decimal digits, `\h` matches hexadecimal digits — that is, +characters in `[0-9a-fA-F]`. + +`\H` matches the opposite of whatever `\h` matches. + + +### 20. ONIG_SYN_OP2_INEFFECTIVE_ESCAPE (disable `\`) + +_Set in: As-is_ + +If set, this disables all escape codes, shorthands, and metacharacters that start +with `\` (or whatever the configured escape character is), allowing `\` to be treated +as a literal `\`. + +You usually do not want this flag to be enabled. + + +### 21. 
ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE (enable `(?(...)then|else)`) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for conditional inclusion of subsequent regex patterns based on whether +a prior named or numbered capture matched, or based on whether a pattern will +match. This supports many different forms, including: + + - `(?(<foo>)then|else)` - condition based on a capture by name. + - `(?('foo')then|else)` - condition based on a capture by name. + - `(?(3)then|else)` - condition based on a capture by number. + - `(?(+3)then|else)` - forward conditional to a future match, by relative position. + - `(?(-3)then|else)` - backward conditional to a prior match, by relative position. + - `(?(foo)then|else)` - this matches a pattern `foo`. (foo is any sub-expression) + +(New feature as of Oniguruma 6.5.) + + +### 22. ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP (enable `\K`) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for `\K`, which excludes all content before it from the overall +regex match (i.e., capture #0). So, for example, pattern `foo\Kbar` would match +`foobar`, but capture #0 would only include `bar`. + +(New feature as of Oniguruma 6.5.) + + +### 23. ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE (enable `\R`) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +Enables support for `\R`, the "general newline" shorthand, which matches +`(\r\n|[\n\v\f\r\u0085\u2028\u2029])` (obviously, the Unicode values cannot be +matched in ASCII encodings). + +(New feature as of Oniguruma 6.5.) + + +### 24. ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT (enable `\N` and `\O`) + +_Set in: Perl, Perl_NG, Oniguruma_ + +Enables support for `\N` and `\O`. `\N` is "not a line break," which is much +like the standard `.` metacharacter, except that while `.` can be affected by +the single-line setting, `\N` always matches exactly one character that is not +one of the various line-break characters (like `\n` and `\r`).
+ +`\O` matches exactly one character, regardless of whether single-line or +multi-line mode are enabled or disabled. + +(New feature as of Oniguruma 6.5.) + + +### 25. ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP (enable `(?~...)`) + +_Set in: Ruby, Oniguruma_ + +Enables support for the `(?~r)` "absent operator" syntax, which matches +as much as possible as long as the result _doesn't_ match pattern `r`. This is +_not_ the same as negative lookahead or negative lookbehind. + +Among the most useful examples of this is `\/\*(?~\*\/)\*\/`, which matches +C-style comments by simply saying "starts with /*, ends with */, and _doesn't_ +contain a */ in between." + +A full explanation of this feature is complicated, but it is useful, and an +excellent article about it is [available on Medium](https://medium.com/rubyinside/the-new-absent-operator-in-ruby-s-regular-expressions-7c3ef6cd0b99). + +(New feature as of Oniguruma 6.5.) + + +### 26. ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT (enable `\X` and `\Y` and `\y`) + +_Set in: Perl, Perl_NG, Ruby, Oniguruma_ + +`\X` is another variation on `.`, designed to support Unicode, in that it matches +a full _grapheme cluster_. In Unicode, `à` can be encoded as one code point, +`U+00E0`, or as two, `U+0061 U+0300`. If those are further escaped using UTF-8, +the former becomes two bytes, and the latter becomes three. Unfortunately, `.` +would naively match only one or two bytes, depending on the encoding, and would +likely incorrectly match anything from just `a` to a broken half of a code point. +`\X` is designed to fix this: It matches the full `à`, no matter how `à` is +encoded or decomposed. + +`\y` matches a cluster boundary, i.e., a zero-width position between +graphemes, somewhat like `\b` matches boundaries between words. `\Y` matches +the _opposite_ of `\y`, that is, a zero-width position between code points in +the _middle_ of a grapheme. + +(New feature as of Oniguruma 6.6.) + + +### 27. 
ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL (enable `(?R)` and `(?&name)`) + +_Set in: Perl_NG_ + +Enables support for substituted backreferences by both name and position using +Perl-5-specific syntax. This supports using `(?R3)` and `(?&name)` to reference +previous (and future) matches, similar to the more-common `\g<3>` and `\g<name>` +backreferences. + +(New feature as of Oniguruma 6.7.) + + +### 28. ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS (enable `(?{...})`) + +_Set in: Perl, Perl_NG, Oniguruma_ + +Enables support for Perl-style "callouts" — pattern substitutions that result from +invoking a callback method. When `(?{foo})` is reached in a pattern, the callback +function set in `onig_set_progress_callout()` will be invoked, and be able to perform +custom computation during the pattern match (and during backtracking). + +Full documentation for this advanced feature can be found in the Oniguruma +`docs/CALLOUT.md` file, with an example in `samples/callout.c`. + +(New feature as of Oniguruma 6.8.) + + +### 29. ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME (enable `(*name)`) + +_Set in: Perl, Perl_NG, Oniguruma_ + +Enables support for Perl-style "callouts" — pattern substitutions that result from +invoking a callback method. When `(*foo)` is reached in a pattern, the callback +function set in `onig_set_callout_of_name()` will be invoked, passing the given name +`foo` to it, and it can perform custom computation during the pattern match (and +during backtracking). + +Full documentation for this advanced feature can be found in the Oniguruma +`docs/CALLOUT.md` file, with an example in `samples/callout.c`. + +(New feature as of Oniguruma 6.8.) + + +### 30. ONIG_SYN_OP2_OPTION_ONIGURUMA (enable options `(?imxWSDPy)` and `(?-imxWDSP)`) + +_Set in: Oniguruma_ + +Enables support of regex options.
(i,m,x,W,S,D,P,y) + +(New feature as of Oniguruma 6.9.2) + + - `i` - Case-insensitivity + - `m` - Multi-line mode (`.` can match `\n`) + - `x` - Extended pattern (free-formatting: whitespace will be ignored) + - `W` - ASCII only word. + - `D` - ASCII only digit. + - `S` - ASCII only space. + - `P` - ASCII only POSIX properties. (includes W,D,S) + +---------- + + +## Syntax Flags (syn) + + +This group contains rules to handle corner cases and constructs that are errors in +some syntaxes but not in others. + +### 0. ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS (independent `?`, `*`, `+`, `{n,m}`) + +_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +This flag specifies how to handle operators like `?` and `*` when they aren't +directly attached to an operand, as in `^*` or `(*)`: Are they an error, are +they discarded, or are they taken as literals? If this flag is clear, they +are taken as literals; otherwise, the ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS flag +determines if they are errors or if they are discarded. + +### 1. ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS (error or ignore independent operators) + +_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +If ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS is set, this flag controls what happens when +independent operators appear in a pattern: If this flag is set, then independent +operators produce an error message; if this flag is clear, then independent +operators are silently discarded. + +### 2. ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP (allow `...)...`) + +_Set in: PosixExtended_ + +This flag, if set, causes a `)` character without a preceding `(` to be treated as +a literal `)`, equivalent to `\)`. If this flag is clear, then an unmatched `)` +character will produce an error message. + +### 3.
ONIG_SYN_ALLOW_INVALID_INTERVAL (allow `{???`) + +_Set in: GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +This flag, if set, causes an invalid range, like `foo{bar}` or `foo{}`, to be +silently discarded, as if `foo` had been written instead. If clear, an invalid +range will produce an error message. + +### 4. ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV (allow `{,n}` to mean `{0,n}`) + +_Set in: Ruby, Oniguruma_ + +If this flag is set, then `r{,n}` will be treated as equivalent to writing +`r{0,n}`. If this flag is clear, then `r{,n}` will produce an error message. + +Note that regardless of whether this flag is set or clear, if +ONIG_SYN_OP_BRACE_INTERVAL is enabled, then `r{n,}` will always be legal: This +flag *only* controls the behavior of the opposite form, `r{,n}`. + +### 5. ONIG_SYN_STRICT_CHECK_BACKREF (error on invalid backrefs) + +_Set in: none_ + +If this flag is set, an invalid backref, like `\1` in a pattern with no captures, +will produce an error. If this flag is clear, then an invalid backref will be +equivalent to the empty string. + +No built-in syntax has this flag enabled. + +### 6. ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (allow `(?<=a|bc)`) + +_Set in: Java, Ruby, Oniguruma_ + +If this flag is set, lookbehind patterns with alternate options may have differing +lengths among those options. If this flag is clear, lookbehind patterns with options +must have each option have identical length to the other options. + +Oniguruma can handle either form, but not all regex engines can, so for compatibility, +Oniguruma allows you to cause regexes for other regex engines to fail if they might +depend on this rule. + +### 7. ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (prefer `\k<name>` over `\3`) + +_Set in: Perl_NG, Ruby, Oniguruma_ + +If this flag is set on the syntax *and* ONIG_OPTION_CAPTURE_GROUP is set when calling +Oniguruma, then if a name is used on any capture, all captures must also use names: A +single use of a named capture prohibits the use of numbered captures.
+ +### 8. ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (allow `(?<x>)...(?<x>)`) + +_Set in: Perl_NG, Ruby, Oniguruma_ + +If this flag is set, multiple capture groups may use the same name. If this flag is +clear, then reuse of a name will produce an error message. + +### 9. ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (`a{n}?` is equivalent to `(?:a{n})?`) + +_Set in: Ruby, Oniguruma_ + +If this flag is set, then intervals of a fixed size will ignore a lazy (non-greedy) +`?` quantifier and treat it as an optional match (an ordinary `r?`), since "match as +little as possible" is meaningless for a fixed-size interval. If this flag is clear, +then `r{n}?` will mean the same as `r{n}`, and the useless `?` will be discarded. + +### 20. ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (add `\n` to `[^...]`) + +_Set in: Grep_ + +If this flag is set, all newline characters (like `\n`) will be excluded from a negative +character class automatically, as if the pattern had been written as `[^...\n]`. If this +flag is clear, negative character classes do not automatically exclude newlines, and +only exclude those characters and ranges written in them. + +### 21. ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (allow `[...\w...]`) + +_Set in: GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +If this flag is set, shorthands like `\w` are allowed to describe characters in character +classes. If this flag is clear, shorthands like `\w` are treated as a redundantly-escaped +literal `w`. + +### 22. ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (silently discard `[z-a]`) + +_Set in: Emacs, Grep_ + +If this flag is set, then character ranges like `[z-a]` that are broken or contain no +characters will be silently ignored. If this flag is clear, then broken or empty +character ranges will produce an error message. + +### 23.
ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (treat `[0-9-a]` as `[0-9\-a]`) + +_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +If this flag is set, then a trailing `-` after a character range will be taken as a +literal `-`, as if it had been escaped as `\-`. If this flag is clear, then a trailing +`-` after a character range will produce an error message. + +### 24. ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (warn on `[[...]` and `[-x]`) + +_Set in: Ruby, Oniguruma_ + +If this flag is set, Oniguruma will be stricter about warning for bad forms in +character classes: `[[...]` will produce a warning, but `[\[...]` will not; +`[-x]` will produce a warning, but `[\-x]` will not; `[x&&-y]` will produce a warning, +while `[x&&\-y]` will not; and so on. If this flag is clear, all of these warnings +will be silently discarded. + +### 25. ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (warn on `(?:a*)+`) + +_Set in: Ruby, Oniguruma_ + +If this flag is set, Oniguruma will warn about nested repeat operators that have no meaning, like `(?:a*)+`. +If this flag is clear, Oniguruma will allow the nested repeat operators without warning about them. + +### 31. ONIG_SYN_CONTEXT_INDEP_ANCHORS + +_Set in: PosixExtended, GnuRegex, Java, Perl, Perl_NG, Ruby, Oniguruma_ + +Not currently used, and does nothing. (But still set in several syntaxes for some +reason.) + +---------- + +## Usage tables + +These tables show which of the built-in syntaxes use which flags and options, for easy comparison between them.
+ +### Group One Flags (op) + +| ID | Option | PosB | PosEx | Emacs | Grep | Gnu | Java | Perl | PeNG | Ruby | Onig | +| ----- | --------------------------------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | +| 0 | `ONIG_SYN_OP_VARIABLE_META_CHARACTERS` | - | - | - | - | - | - | - | - | - | - | +| 1 | `ONIG_SYN_OP_DOT_ANYCHAR` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 2 | `ONIG_SYN_OP_ASTERISK_ZERO_INF` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 3 | `ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF` | - | - | - | - | - | - | - | - | - | - | +| 4 | `ONIG_SYN_OP_PLUS_ONE_INF` | - | Yes | Yes | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 5 | `ONIG_SYN_OP_ESC_PLUS_ONE_INF` | - | - | - | Yes | - | - | - | - | - | - | +| 6 | `ONIG_SYN_OP_QMARK_ZERO_ONE` | - | Yes | Yes | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 7 | `ONIG_SYN_OP_ESC_QMARK_ZERO_ONE` | - | - | - | Yes | - | - | - | - | - | - | +| 8 | `ONIG_SYN_OP_BRACE_INTERVAL` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 9 | `ONIG_SYN_OP_ESC_BRACE_INTERVAL` | Yes | - | Yes | Yes | - | - | - | - | - | - | +| 10 | `ONIG_SYN_OP_VBAR_ALT` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 11 | `ONIG_SYN_OP_ESC_VBAR_ALT` | - | - | Yes | Yes | - | - | - | - | - | - | +| 12 | `ONIG_SYN_OP_LPAREN_SUBEXP` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 13 | `ONIG_SYN_OP_ESC_LPAREN_SUBEXP` | Yes | - | Yes | Yes | - | - | - | - | - | - | +| 14 | `ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 15 | `ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 16 | `ONIG_SYN_OP_DECIMAL_BACKREF` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 17 | `ONIG_SYN_OP_BRACKET_CC` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 18 | `ONIG_SYN_OP_ESC_W_WORD` | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 19 | 
`ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END` | - | - | - | Yes | Yes | - | - | - | - | - | +| 20 | `ONIG_SYN_OP_ESC_B_WORD_BOUND` | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 21 | `ONIG_SYN_OP_ESC_S_WHITE_SPACE` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 22 | `ONIG_SYN_OP_ESC_D_DIGIT` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 23 | `ONIG_SYN_OP_LINE_ANCHOR` | - | - | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +| 24 | `ONIG_SYN_OP_POSIX_BRACKET` | Yes | Yes | Yes | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 25 | `ONIG_SYN_OP_QMARK_NON_GREEDY` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 26 | `ONIG_SYN_OP_ESC_CONTROL_CHARS` | Yes | Yes | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 27 | `ONIG_SYN_OP_ESC_C_CONTROL` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 28 | `ONIG_SYN_OP_ESC_OCTAL3` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 29 | `ONIG_SYN_OP_ESC_X_HEX2` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 30 | `ONIG_SYN_OP_ESC_X_BRACE_HEX8` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 31 | `ONIG_SYN_OP_ESC_O_BRACE_OCTAL` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | + +### Group Two Flags (op2) + +| ID | Option | PosB | PosEx | Emacs | Grep | Gnu | Java | Perl | PeNG | Ruby | Onig | +| ----- | --------------------------------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | +| 0 | `ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE` | - | - | - | - | - | Yes | Yes | Yes | - | - | +| 1 | `ONIG_SYN_OP2_QMARK_GROUP_EFFECT` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 2 | `ONIG_SYN_OP2_OPTION_PERL` | - | - | - | - | - | Yes | Yes | Yes | - | - | +| 3 | `ONIG_SYN_OP2_OPTION_RUBY` | - | - | - | - | - | - | - | - | Yes | - | +| 4 | `ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT` | - | - | - | - | - | - | - | - | Yes | Yes | +| 5 | `ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL` | - | - | - | - | - | Yes | - | - | - | - | +| 6 | `ONIG_SYN_OP2_CCLASS_SET_OP` | - | - 
| - | - | - | - | - | Yes | Yes | Yes | +| 7 | `ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 8 | `ONIG_SYN_OP2_ESC_K_NAMED_BACKREF` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 9 | `ONIG_SYN_OP2_ESC_G_SUBEXP_CALL` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 10 | `ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY` | - | - | - | - | - | - | - | - | - | - | +| 11 | `ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL` | - | - | - | - | - | - | - | - | Yes | Yes | +| 12 | `ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META` | - | - | - | - | - | - | - | - | Yes | Yes | +| 13 | `ONIG_SYN_OP2_ESC_V_VTAB` | - | - | - | - | - | Yes | - | - | Yes | Yes | +| 14 | `ONIG_SYN_OP2_ESC_U_HEX4` | - | - | - | - | - | Yes | - | - | Yes | Yes | +| 15 | `ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR` | - | - | Yes | - | - | - | - | - | - | - | +| 16 | `ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY` | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | +| 17 | `ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 18 | `ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS` | - | - | - | - | - | - | - | - | - | - | +| 19 | `ONIG_SYN_OP2_ESC_H_XDIGIT` | - | - | - | - | - | - | - | - | Yes | Yes | +| 20 | `ONIG_SYN_OP2_INEFFECTIVE_ESCAPE` | - | - | - | - | - | - | - | - | - | - | +| 21 | `ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 22 | `ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 23 | `ONIG_SYN_OP2_ESC_CAPITAL_R_GENERAL_NEWLINE` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 24 | `ONIG_SYN_OP2_ESC_CAPITAL_N_O_SUPER_DOT` | - | - | - | - | - | - | Yes | Yes | - | Yes | +| 25 | `ONIG_SYN_OP2_QMARK_TILDE_ABSENT_GROUP` | - | - | - | - | - | - | - | - | Yes | Yes | +| 26 | `ONIG_SYN_OP2_ESC_X_Y_TEXT_SEGMENT` | - | - | - | - | - | - | Yes | Yes | Yes | Yes | +| 27 | `ONIG_SYN_OP2_QMARK_PERL_SUBEXP_CALL` | - | - | - | - | - | - | - | Yes | - | - | +| 28 | 
`ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS` | - | - | - | - | - | - | Yes | Yes | Yes | - | +| 29 | `ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME` | - | - | - | - | - | - | Yes | Yes | Yes | - | +| 30 | `ONIG_SYN_OP2_OPTION_ONIGURUMA` | - | - | - | - | - | - | - | - | - | Yes | + +### Syntax Flags (syn) + +| ID | Option | PosB | PosEx | Emacs | Grep | Gnu | Java | Perl | PeNG | Ruby | Onig | +| ----- | --------------------------------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | +| 0 | `ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 1 | `ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 2 | `ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP` | - | Yes | - | - | - | - | - | - | - | - | +| 3 | `ONIG_SYN_ALLOW_INVALID_INTERVAL` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 4 | `ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV` | - | - | - | - | - | - | - | - | Yes | Yes | +| 5 | `ONIG_SYN_STRICT_CHECK_BACKREF` | - | - | - | - | - | - | - | - | - | - | +| 6 | `ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 7 | `ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 8 | `ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME` | - | - | - | - | - | - | - | Yes | Yes | Yes | +| 9 | `ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY` | - | - | - | - | - | - | - | - | Yes | Yes | +| 20 | `ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC` | - | - | - | Yes | - | - | - | - | - | - | +| 21 | `ONIG_SYN_BACKSLASH_ESCAPE_IN_CC` | - | - | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 22 | `ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC` | - | - | Yes | Yes | - | - | - | - | - | - | +| 23 | `ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | +| 24 | `ONIG_SYN_WARN_CC_OP_NOT_ESCAPED` | - | - | - | - | - | - | - | - | Yes | Yes | +| 25 | `ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT` | - | - | - | - | - | - | - 
| - | Yes | Yes | +| 31 | `ONIG_SYN_CONTEXT_INDEP_ANCHORS` | - | Yes | - | - | Yes | Yes | Yes | Yes | Yes | Yes | diff --git a/doc/UNICODE_PROPERTIES b/doc/UNICODE_PROPERTIES index 1f961eb..1148b4d 100644 --- a/doc/UNICODE_PROPERTIES +++ b/doc/UNICODE_PROPERTIES @@ -1,4 +1,4 @@ -Unicode Properties (from Unicode Version: 11.0.0) +Unicode Properties (from Unicode Version: 12.0.0) 15: ASCII_Hex_Digit 16: Adlam @@ -56,203 +56,207 @@ Unicode Properties (from Unicode Version: 11.0.0) 68: Duployan 69: Egyptian_Hieroglyphs 70: Elbasan - 71: Emoji - 72: Emoji_Component - 73: Emoji_Modifier - 74: Emoji_Modifier_Base - 75: Emoji_Presentation - 76: Ethiopic - 77: Extended_Pictographic - 78: Extender - 79: Georgian - 80: Glagolitic - 81: Gothic - 82: Grantha - 83: Grapheme_Base - 84: Grapheme_Extend - 85: Grapheme_Link - 86: Greek - 87: Gujarati - 88: Gunjala_Gondi - 89: Gurmukhi - 90: Han - 91: Hangul - 92: Hanifi_Rohingya - 93: Hanunoo - 94: Hatran - 95: Hebrew - 96: Hex_Digit - 97: Hiragana - 98: Hyphen - 99: IDS_Binary_Operator -100: IDS_Trinary_Operator -101: ID_Continue -102: ID_Start -103: Ideographic -104: Imperial_Aramaic -105: Inherited -106: Inscriptional_Pahlavi -107: Inscriptional_Parthian -108: Javanese -109: Join_Control -110: Kaithi -111: Kannada -112: Katakana -113: Kayah_Li -114: Kharoshthi -115: Khmer -116: Khojki -117: Khudawadi -118: L -119: LC -120: Lao -121: Latin -122: Lepcha -123: Limbu -124: Linear_A -125: Linear_B -126: Lisu -127: Ll -128: Lm -129: Lo -130: Logical_Order_Exception -131: Lowercase -132: Lt -133: Lu -134: Lycian -135: Lydian -136: M -137: Mahajani -138: Makasar -139: Malayalam -140: Mandaic -141: Manichaean -142: Marchen -143: Masaram_Gondi -144: Math -145: Mc -146: Me -147: Medefaidrin -148: Meetei_Mayek -149: Mende_Kikakui -150: Meroitic_Cursive -151: Meroitic_Hieroglyphs -152: Miao -153: Mn -154: Modi -155: Mongolian -156: Mro -157: Multani -158: Myanmar -159: N -160: Nabataean -161: Nd -162: New_Tai_Lue -163: Newa -164: Nko -165: 
Nl -166: No -167: Noncharacter_Code_Point -168: Nushu -169: Ogham -170: Ol_Chiki -171: Old_Hungarian -172: Old_Italic -173: Old_North_Arabian -174: Old_Permic -175: Old_Persian -176: Old_Sogdian -177: Old_South_Arabian -178: Old_Turkic -179: Oriya -180: Osage -181: Osmanya -182: Other_Alphabetic -183: Other_Default_Ignorable_Code_Point -184: Other_Grapheme_Extend -185: Other_ID_Continue -186: Other_ID_Start -187: Other_Lowercase -188: Other_Math -189: Other_Uppercase -190: P -191: Pahawh_Hmong -192: Palmyrene -193: Pattern_Syntax -194: Pattern_White_Space -195: Pau_Cin_Hau -196: Pc -197: Pd -198: Pe -199: Pf -200: Phags_Pa -201: Phoenician -202: Pi -203: Po -204: Prepended_Concatenation_Mark -205: Ps -206: Psalter_Pahlavi -207: Quotation_Mark -208: Radical -209: Regional_Indicator -210: Rejang -211: Runic -212: S -213: Samaritan -214: Saurashtra -215: Sc -216: Sentence_Terminal -217: Sharada -218: Shavian -219: Siddham -220: SignWriting -221: Sinhala -222: Sk -223: Sm -224: So -225: Soft_Dotted -226: Sogdian -227: Sora_Sompeng -228: Soyombo -229: Sundanese -230: Syloti_Nagri -231: Syriac -232: Tagalog -233: Tagbanwa -234: Tai_Le -235: Tai_Tham -236: Tai_Viet -237: Takri -238: Tamil -239: Tangut -240: Telugu -241: Terminal_Punctuation -242: Thaana -243: Thai -244: Tibetan -245: Tifinagh -246: Tirhuta -247: Ugaritic -248: Unified_Ideograph -249: Unknown -250: Uppercase -251: Vai -252: Variation_Selector -253: Warang_Citi -254: White_Space -255: XID_Continue -256: XID_Start -257: Yi -258: Z -259: Zanabazar_Square -260: Zl -261: Zp -262: Zs + 71: Elymaic + 72: Emoji + 73: Emoji_Component + 74: Emoji_Modifier + 75: Emoji_Modifier_Base + 76: Emoji_Presentation + 77: Ethiopic + 78: Extended_Pictographic + 79: Extender + 80: Georgian + 81: Glagolitic + 82: Gothic + 83: Grantha + 84: Grapheme_Base + 85: Grapheme_Extend + 86: Grapheme_Link + 87: Greek + 88: Gujarati + 89: Gunjala_Gondi + 90: Gurmukhi + 91: Han + 92: Hangul + 93: Hanifi_Rohingya + 94: Hanunoo + 95: Hatran + 
96: Hebrew + 97: Hex_Digit + 98: Hiragana + 99: Hyphen +100: IDS_Binary_Operator +101: IDS_Trinary_Operator +102: ID_Continue +103: ID_Start +104: Ideographic +105: Imperial_Aramaic +106: Inherited +107: Inscriptional_Pahlavi +108: Inscriptional_Parthian +109: Javanese +110: Join_Control +111: Kaithi +112: Kannada +113: Katakana +114: Kayah_Li +115: Kharoshthi +116: Khmer +117: Khojki +118: Khudawadi +119: L +120: LC +121: Lao +122: Latin +123: Lepcha +124: Limbu +125: Linear_A +126: Linear_B +127: Lisu +128: Ll +129: Lm +130: Lo +131: Logical_Order_Exception +132: Lowercase +133: Lt +134: Lu +135: Lycian +136: Lydian +137: M +138: Mahajani +139: Makasar +140: Malayalam +141: Mandaic +142: Manichaean +143: Marchen +144: Masaram_Gondi +145: Math +146: Mc +147: Me +148: Medefaidrin +149: Meetei_Mayek +150: Mende_Kikakui +151: Meroitic_Cursive +152: Meroitic_Hieroglyphs +153: Miao +154: Mn +155: Modi +156: Mongolian +157: Mro +158: Multani +159: Myanmar +160: N +161: Nabataean +162: Nandinagari +163: Nd +164: New_Tai_Lue +165: Newa +166: Nko +167: Nl +168: No +169: Noncharacter_Code_Point +170: Nushu +171: Nyiakeng_Puachue_Hmong +172: Ogham +173: Ol_Chiki +174: Old_Hungarian +175: Old_Italic +176: Old_North_Arabian +177: Old_Permic +178: Old_Persian +179: Old_Sogdian +180: Old_South_Arabian +181: Old_Turkic +182: Oriya +183: Osage +184: Osmanya +185: Other_Alphabetic +186: Other_Default_Ignorable_Code_Point +187: Other_Grapheme_Extend +188: Other_ID_Continue +189: Other_ID_Start +190: Other_Lowercase +191: Other_Math +192: Other_Uppercase +193: P +194: Pahawh_Hmong +195: Palmyrene +196: Pattern_Syntax +197: Pattern_White_Space +198: Pau_Cin_Hau +199: Pc +200: Pd +201: Pe +202: Pf +203: Phags_Pa +204: Phoenician +205: Pi +206: Po +207: Prepended_Concatenation_Mark +208: Ps +209: Psalter_Pahlavi +210: Quotation_Mark +211: Radical +212: Regional_Indicator +213: Rejang +214: Runic +215: S +216: Samaritan +217: Saurashtra +218: Sc +219: Sentence_Terminal +220: Sharada 
+221: Shavian +222: Siddham +223: SignWriting +224: Sinhala +225: Sk +226: Sm +227: So +228: Soft_Dotted +229: Sogdian +230: Sora_Sompeng +231: Soyombo +232: Sundanese +233: Syloti_Nagri +234: Syriac +235: Tagalog +236: Tagbanwa +237: Tai_Le +238: Tai_Tham +239: Tai_Viet +240: Takri +241: Tamil +242: Tangut +243: Telugu +244: Terminal_Punctuation +245: Thaana +246: Thai +247: Tibetan +248: Tifinagh +249: Tirhuta +250: Ugaritic +251: Unified_Ideograph +252: Unknown +253: Uppercase +254: Vai +255: Variation_Selector +256: Wancho +257: Warang_Citi +258: White_Space +259: XID_Continue +260: XID_Start +261: Yi +262: Z +263: Zanabazar_Square +264: Zl +265: Zp +266: Zs 16: Adlm 42: Aghb 15: AHex 21: Arab -104: Armi +105: Armi 22: Armn 24: Avst 25: Bali @@ -270,24 +274,24 @@ Unicode Properties (from Unicode Version: 11.0.0) 45: Cakm 38: Cans 39: Cari -119: Cased_Letter +120: Cased_Letter 52: Cher 40: CI -198: Close_Punctuation -136: Combining_Mark -196: Connector_Punctuation +201: Close_Punctuation +137: Combining_Mark +199: Connector_Punctuation 43: Control 56: Copt 59: Cprt -215: Currency_Symbol +218: Currency_Symbol 47: CWCF 48: CWCM 49: CWL 50: CWT 51: CWU 60: Cyrl -197: Dash_Punctuation -161: Decimal_Number +200: Dash_Punctuation +163: Decimal_Number 63: Dep 65: Deva 62: DI @@ -297,475 +301,488 @@ Unicode Properties (from Unicode Version: 11.0.0) 68: Dupl 69: Egyp 70: Elba -146: Enclosing_Mark - 76: Ethi - 78: Ext -199: Final_Punctuation + 71: Elym +147: Enclosing_Mark + 77: Ethi + 79: Ext +202: Final_Punctuation 44: Format - 79: Geor - 80: Glag - 88: Gong -143: Gonm - 81: Goth - 82: Gran - 83: Gr_Base - 86: Grek - 84: Gr_Ext - 85: Gr_Link - 87: Gujr - 89: Guru - 91: Hang - 90: Hani - 93: Hano - 94: Hatr - 95: Hebr - 96: Hex - 97: Hira + 80: Geor + 81: Glag + 89: Gong +144: Gonm + 82: Goth + 83: Gran + 84: Gr_Base + 87: Grek + 85: Gr_Ext + 86: Gr_Link + 88: Gujr + 90: Guru + 92: Hang + 91: Hani + 94: Hano + 95: Hatr + 96: Hebr + 97: Hex + 98: Hira 19: Hluw -191: Hmng 
-171: Hung -101: IDC -103: Ideo -102: IDS - 99: IDSB -100: IDST -202: Initial_Punctuation -172: Ital -108: Java -109: Join_C -113: Kali -112: Kana -114: Khar -115: Khmr -116: Khoj -111: Knda -110: Kthi -235: Lana -120: Laoo -121: Latn -122: Lepc -118: Letter -165: Letter_Number -123: Limb -124: Lina -125: Linb -260: Line_Separator -130: LOE -127: Lowercase_Letter -134: Lyci -135: Lydi -137: Mahj -138: Maka -140: Mand -141: Mani -142: Marc -136: Mark -223: Math_Symbol -147: Medf -149: Mend -150: Merc -151: Mero -139: Mlym -128: Modifier_Letter -222: Modifier_Symbol -155: Mong -156: Mroo -148: Mtei -157: Mult -158: Mymr -173: Narb -160: Nbat -167: NChar -164: Nkoo -153: Nonspacing_Mark -168: Nshu -159: Number -182: OAlpha -183: ODI -169: Ogam -184: OGr_Ext -185: OIDC -186: OIDS -170: Olck -187: OLower -188: OMath -205: Open_Punctuation -178: Orkh -179: Orya -180: Osge -181: Osma +194: Hmng +171: Hmnp +174: Hung +102: IDC +104: Ideo +103: IDS +100: IDSB +101: IDST +205: Initial_Punctuation +175: Ital +109: Java +110: Join_C +114: Kali +113: Kana +115: Khar +116: Khmr +117: Khoj +112: Knda +111: Kthi +238: Lana +121: Laoo +122: Latn +123: Lepc +119: Letter +167: Letter_Number +124: Limb +125: Lina +126: Linb +264: Line_Separator +131: LOE +128: Lowercase_Letter +135: Lyci +136: Lydi +138: Mahj +139: Maka +141: Mand +142: Mani +143: Marc +137: Mark +226: Math_Symbol +148: Medf +150: Mend +151: Merc +152: Mero +140: Mlym +129: Modifier_Letter +225: Modifier_Symbol +156: Mong +157: Mroo +149: Mtei +158: Mult +159: Mymr +162: Nand +176: Narb +161: Nbat +169: NChar +166: Nkoo +154: Nonspacing_Mark +170: Nshu +160: Number +185: OAlpha +186: ODI +172: Ogam +187: OGr_Ext +188: OIDC +189: OIDS +173: Olck +190: OLower +191: OMath +208: Open_Punctuation +181: Orkh +182: Orya +183: Osge +184: Osma 37: Other -129: Other_Letter -166: Other_Number -203: Other_Punctuation -224: Other_Symbol -189: OUpper -192: Palm -261: Paragraph_Separator -193: Pat_Syn -194: Pat_WS -195: Pauc -204: 
PCM -174: Perm -200: Phag -106: Phli -206: Phlp -201: Phnx -152: Plrd +130: Other_Letter +168: Other_Number +206: Other_Punctuation +227: Other_Symbol +192: OUpper +195: Palm +265: Paragraph_Separator +196: Pat_Syn +197: Pat_WS +198: Pauc +207: PCM +177: Perm +203: Phag +107: Phli +209: Phlp +204: Phnx +153: Plrd 54: Private_Use -107: Prti -190: Punctuation +108: Prti +193: Punctuation 56: Qaac -105: Qaai -207: QMark -209: RI -210: Rjng - 92: Rohg -211: Runr -213: Samr -177: Sarb -214: Saur -225: SD -258: Separator -220: Sgnw -218: Shaw -217: Shrd -219: Sidd -117: Sind -221: Sinh -226: Sogd -176: Sogo -227: Sora -228: Soyo -262: Space_Separator -145: Spacing_Mark -216: STerm -229: Sund +106: Qaai +210: QMark +212: RI +213: Rjng + 93: Rohg +214: Runr +216: Samr +180: Sarb +217: Saur +228: SD +262: Separator +223: Sgnw +221: Shaw +220: Shrd +222: Sidd +118: Sind +224: Sinh +229: Sogd +179: Sogo +230: Sora +231: Soyo +266: Space_Separator +146: Spacing_Mark +219: STerm +232: Sund 57: Surrogate -230: Sylo -212: Symbol -231: Syrc -233: Tagb -237: Takr -234: Tale -162: Talu -238: Taml -239: Tang -236: Tavt -240: Telu -241: Term -245: Tfng -232: Tglg -242: Thaa -244: Tibt -246: Tirh -132: Titlecase_Letter -247: Ugar -248: UIdeo +233: Sylo +215: Symbol +234: Syrc +236: Tagb +240: Takr +237: Tale +164: Talu +241: Taml +242: Tang +239: Tavt +243: Telu +244: Term +248: Tfng +235: Tglg +245: Thaa +247: Tibt +249: Tirh +133: Titlecase_Letter +250: Ugar +251: UIdeo 53: Unassigned -133: Uppercase_Letter -251: Vaii -252: VS -253: Wara -254: WSpace -255: XIDC -256: XIDS -175: Xpeo +134: Uppercase_Letter +254: Vaii +255: VS +257: Wara +256: Wcho +258: WSpace +259: XIDC +260: XIDS +178: Xpeo 58: Xsux -257: Yiii -259: Zanb -105: Zinh +261: Yiii +263: Zanb +106: Zinh 55: Zyyy -249: Zzzz -263: In_Basic_Latin -264: In_Latin_1_Supplement -265: In_Latin_Extended_A -266: In_Latin_Extended_B -267: In_IPA_Extensions -268: In_Spacing_Modifier_Letters -269: In_Combining_Diacritical_Marks -270: 
In_Greek_and_Coptic -271: In_Cyrillic -272: In_Cyrillic_Supplement -273: In_Armenian -274: In_Hebrew -275: In_Arabic -276: In_Syriac -277: In_Arabic_Supplement -278: In_Thaana -279: In_NKo -280: In_Samaritan -281: In_Mandaic -282: In_Syriac_Supplement -283: In_Arabic_Extended_A -284: In_Devanagari -285: In_Bengali -286: In_Gurmukhi -287: In_Gujarati -288: In_Oriya -289: In_Tamil -290: In_Telugu -291: In_Kannada -292: In_Malayalam -293: In_Sinhala -294: In_Thai -295: In_Lao -296: In_Tibetan -297: In_Myanmar -298: In_Georgian -299: In_Hangul_Jamo -300: In_Ethiopic -301: In_Ethiopic_Supplement -302: In_Cherokee -303: In_Unified_Canadian_Aboriginal_Syllabics -304: In_Ogham -305: In_Runic -306: In_Tagalog -307: In_Hanunoo -308: In_Buhid -309: In_Tagbanwa -310: In_Khmer -311: In_Mongolian -312: In_Unified_Canadian_Aboriginal_Syllabics_Extended -313: In_Limbu -314: In_Tai_Le -315: In_New_Tai_Lue -316: In_Khmer_Symbols -317: In_Buginese -318: In_Tai_Tham -319: In_Combining_Diacritical_Marks_Extended -320: In_Balinese -321: In_Sundanese -322: In_Batak -323: In_Lepcha -324: In_Ol_Chiki -325: In_Cyrillic_Extended_C -326: In_Georgian_Extended -327: In_Sundanese_Supplement -328: In_Vedic_Extensions -329: In_Phonetic_Extensions -330: In_Phonetic_Extensions_Supplement -331: In_Combining_Diacritical_Marks_Supplement -332: In_Latin_Extended_Additional -333: In_Greek_Extended -334: In_General_Punctuation -335: In_Superscripts_and_Subscripts -336: In_Currency_Symbols -337: In_Combining_Diacritical_Marks_for_Symbols -338: In_Letterlike_Symbols -339: In_Number_Forms -340: In_Arrows -341: In_Mathematical_Operators -342: In_Miscellaneous_Technical -343: In_Control_Pictures -344: In_Optical_Character_Recognition -345: In_Enclosed_Alphanumerics -346: In_Box_Drawing -347: In_Block_Elements -348: In_Geometric_Shapes -349: In_Miscellaneous_Symbols -350: In_Dingbats -351: In_Miscellaneous_Mathematical_Symbols_A -352: In_Supplemental_Arrows_A -353: In_Braille_Patterns -354: 
In_Supplemental_Arrows_B -355: In_Miscellaneous_Mathematical_Symbols_B -356: In_Supplemental_Mathematical_Operators -357: In_Miscellaneous_Symbols_and_Arrows -358: In_Glagolitic -359: In_Latin_Extended_C -360: In_Coptic -361: In_Georgian_Supplement -362: In_Tifinagh -363: In_Ethiopic_Extended -364: In_Cyrillic_Extended_A -365: In_Supplemental_Punctuation -366: In_CJK_Radicals_Supplement -367: In_Kangxi_Radicals -368: In_Ideographic_Description_Characters -369: In_CJK_Symbols_and_Punctuation -370: In_Hiragana -371: In_Katakana -372: In_Bopomofo -373: In_Hangul_Compatibility_Jamo -374: In_Kanbun -375: In_Bopomofo_Extended -376: In_CJK_Strokes -377: In_Katakana_Phonetic_Extensions -378: In_Enclosed_CJK_Letters_and_Months -379: In_CJK_Compatibility -380: In_CJK_Unified_Ideographs_Extension_A -381: In_Yijing_Hexagram_Symbols -382: In_CJK_Unified_Ideographs -383: In_Yi_Syllables -384: In_Yi_Radicals -385: In_Lisu -386: In_Vai -387: In_Cyrillic_Extended_B -388: In_Bamum -389: In_Modifier_Tone_Letters -390: In_Latin_Extended_D -391: In_Syloti_Nagri -392: In_Common_Indic_Number_Forms -393: In_Phags_pa -394: In_Saurashtra -395: In_Devanagari_Extended -396: In_Kayah_Li -397: In_Rejang -398: In_Hangul_Jamo_Extended_A -399: In_Javanese -400: In_Myanmar_Extended_B -401: In_Cham -402: In_Myanmar_Extended_A -403: In_Tai_Viet -404: In_Meetei_Mayek_Extensions -405: In_Ethiopic_Extended_A -406: In_Latin_Extended_E -407: In_Cherokee_Supplement -408: In_Meetei_Mayek -409: In_Hangul_Syllables -410: In_Hangul_Jamo_Extended_B -411: In_High_Surrogates -412: In_High_Private_Use_Surrogates -413: In_Low_Surrogates -414: In_Private_Use_Area -415: In_CJK_Compatibility_Ideographs -416: In_Alphabetic_Presentation_Forms -417: In_Arabic_Presentation_Forms_A -418: In_Variation_Selectors -419: In_Vertical_Forms -420: In_Combining_Half_Marks -421: In_CJK_Compatibility_Forms -422: In_Small_Form_Variants -423: In_Arabic_Presentation_Forms_B -424: In_Halfwidth_and_Fullwidth_Forms -425: In_Specials -426: 
In_Linear_B_Syllabary -427: In_Linear_B_Ideograms -428: In_Aegean_Numbers -429: In_Ancient_Greek_Numbers -430: In_Ancient_Symbols -431: In_Phaistos_Disc -432: In_Lycian -433: In_Carian -434: In_Coptic_Epact_Numbers -435: In_Old_Italic -436: In_Gothic -437: In_Old_Permic -438: In_Ugaritic -439: In_Old_Persian -440: In_Deseret -441: In_Shavian -442: In_Osmanya -443: In_Osage -444: In_Elbasan -445: In_Caucasian_Albanian -446: In_Linear_A -447: In_Cypriot_Syllabary -448: In_Imperial_Aramaic -449: In_Palmyrene -450: In_Nabataean -451: In_Hatran -452: In_Phoenician -453: In_Lydian -454: In_Meroitic_Hieroglyphs -455: In_Meroitic_Cursive -456: In_Kharoshthi -457: In_Old_South_Arabian -458: In_Old_North_Arabian -459: In_Manichaean -460: In_Avestan -461: In_Inscriptional_Parthian -462: In_Inscriptional_Pahlavi -463: In_Psalter_Pahlavi -464: In_Old_Turkic -465: In_Old_Hungarian -466: In_Hanifi_Rohingya -467: In_Rumi_Numeral_Symbols -468: In_Old_Sogdian -469: In_Sogdian -470: In_Brahmi -471: In_Kaithi -472: In_Sora_Sompeng -473: In_Chakma -474: In_Mahajani -475: In_Sharada -476: In_Sinhala_Archaic_Numbers -477: In_Khojki -478: In_Multani -479: In_Khudawadi -480: In_Grantha -481: In_Newa -482: In_Tirhuta -483: In_Siddham -484: In_Modi -485: In_Mongolian_Supplement -486: In_Takri -487: In_Ahom -488: In_Dogra -489: In_Warang_Citi -490: In_Zanabazar_Square -491: In_Soyombo -492: In_Pau_Cin_Hau -493: In_Bhaiksuki -494: In_Marchen -495: In_Masaram_Gondi -496: In_Gunjala_Gondi -497: In_Makasar -498: In_Cuneiform -499: In_Cuneiform_Numbers_and_Punctuation -500: In_Early_Dynastic_Cuneiform -501: In_Egyptian_Hieroglyphs -502: In_Anatolian_Hieroglyphs -503: In_Bamum_Supplement -504: In_Mro -505: In_Bassa_Vah -506: In_Pahawh_Hmong -507: In_Medefaidrin -508: In_Miao -509: In_Ideographic_Symbols_and_Punctuation -510: In_Tangut -511: In_Tangut_Components -512: In_Kana_Supplement -513: In_Kana_Extended_A -514: In_Nushu -515: In_Duployan -516: In_Shorthand_Format_Controls -517: 
In_Byzantine_Musical_Symbols -518: In_Musical_Symbols -519: In_Ancient_Greek_Musical_Notation -520: In_Mayan_Numerals -521: In_Tai_Xuan_Jing_Symbols -522: In_Counting_Rod_Numerals -523: In_Mathematical_Alphanumeric_Symbols -524: In_Sutton_SignWriting -525: In_Glagolitic_Supplement -526: In_Mende_Kikakui -527: In_Adlam -528: In_Indic_Siyaq_Numbers -529: In_Arabic_Mathematical_Alphabetic_Symbols -530: In_Mahjong_Tiles -531: In_Domino_Tiles -532: In_Playing_Cards -533: In_Enclosed_Alphanumeric_Supplement -534: In_Enclosed_Ideographic_Supplement -535: In_Miscellaneous_Symbols_and_Pictographs -536: In_Emoticons -537: In_Ornamental_Dingbats -538: In_Transport_and_Map_Symbols -539: In_Alchemical_Symbols -540: In_Geometric_Shapes_Extended -541: In_Supplemental_Arrows_C -542: In_Supplemental_Symbols_and_Pictographs -543: In_Chess_Symbols -544: In_CJK_Unified_Ideographs_Extension_B -545: In_CJK_Unified_Ideographs_Extension_C -546: In_CJK_Unified_Ideographs_Extension_D -547: In_CJK_Unified_Ideographs_Extension_E -548: In_CJK_Unified_Ideographs_Extension_F -549: In_CJK_Compatibility_Ideographs_Supplement -550: In_Tags -551: In_Variation_Selectors_Supplement -552: In_Supplementary_Private_Use_Area_A -553: In_Supplementary_Private_Use_Area_B -554: In_No_Block +252: Zzzz +267: In_Basic_Latin +268: In_Latin_1_Supplement +269: In_Latin_Extended_A +270: In_Latin_Extended_B +271: In_IPA_Extensions +272: In_Spacing_Modifier_Letters +273: In_Combining_Diacritical_Marks +274: In_Greek_and_Coptic +275: In_Cyrillic +276: In_Cyrillic_Supplement +277: In_Armenian +278: In_Hebrew +279: In_Arabic +280: In_Syriac +281: In_Arabic_Supplement +282: In_Thaana +283: In_NKo +284: In_Samaritan +285: In_Mandaic +286: In_Syriac_Supplement +287: In_Arabic_Extended_A +288: In_Devanagari +289: In_Bengali +290: In_Gurmukhi +291: In_Gujarati +292: In_Oriya +293: In_Tamil +294: In_Telugu +295: In_Kannada +296: In_Malayalam +297: In_Sinhala +298: In_Thai +299: In_Lao +300: In_Tibetan +301: In_Myanmar +302: 
In_Georgian +303: In_Hangul_Jamo +304: In_Ethiopic +305: In_Ethiopic_Supplement +306: In_Cherokee +307: In_Unified_Canadian_Aboriginal_Syllabics +308: In_Ogham +309: In_Runic +310: In_Tagalog +311: In_Hanunoo +312: In_Buhid +313: In_Tagbanwa +314: In_Khmer +315: In_Mongolian +316: In_Unified_Canadian_Aboriginal_Syllabics_Extended +317: In_Limbu +318: In_Tai_Le +319: In_New_Tai_Lue +320: In_Khmer_Symbols +321: In_Buginese +322: In_Tai_Tham +323: In_Combining_Diacritical_Marks_Extended +324: In_Balinese +325: In_Sundanese +326: In_Batak +327: In_Lepcha +328: In_Ol_Chiki +329: In_Cyrillic_Extended_C +330: In_Georgian_Extended +331: In_Sundanese_Supplement +332: In_Vedic_Extensions +333: In_Phonetic_Extensions +334: In_Phonetic_Extensions_Supplement +335: In_Combining_Diacritical_Marks_Supplement +336: In_Latin_Extended_Additional +337: In_Greek_Extended +338: In_General_Punctuation +339: In_Superscripts_and_Subscripts +340: In_Currency_Symbols +341: In_Combining_Diacritical_Marks_for_Symbols +342: In_Letterlike_Symbols +343: In_Number_Forms +344: In_Arrows +345: In_Mathematical_Operators +346: In_Miscellaneous_Technical +347: In_Control_Pictures +348: In_Optical_Character_Recognition +349: In_Enclosed_Alphanumerics +350: In_Box_Drawing +351: In_Block_Elements +352: In_Geometric_Shapes +353: In_Miscellaneous_Symbols +354: In_Dingbats +355: In_Miscellaneous_Mathematical_Symbols_A +356: In_Supplemental_Arrows_A +357: In_Braille_Patterns +358: In_Supplemental_Arrows_B +359: In_Miscellaneous_Mathematical_Symbols_B +360: In_Supplemental_Mathematical_Operators +361: In_Miscellaneous_Symbols_and_Arrows +362: In_Glagolitic +363: In_Latin_Extended_C +364: In_Coptic +365: In_Georgian_Supplement +366: In_Tifinagh +367: In_Ethiopic_Extended +368: In_Cyrillic_Extended_A +369: In_Supplemental_Punctuation +370: In_CJK_Radicals_Supplement +371: In_Kangxi_Radicals +372: In_Ideographic_Description_Characters +373: In_CJK_Symbols_and_Punctuation +374: In_Hiragana +375: In_Katakana +376: 
In_Bopomofo +377: In_Hangul_Compatibility_Jamo +378: In_Kanbun +379: In_Bopomofo_Extended +380: In_CJK_Strokes +381: In_Katakana_Phonetic_Extensions +382: In_Enclosed_CJK_Letters_and_Months +383: In_CJK_Compatibility +384: In_CJK_Unified_Ideographs_Extension_A +385: In_Yijing_Hexagram_Symbols +386: In_CJK_Unified_Ideographs +387: In_Yi_Syllables +388: In_Yi_Radicals +389: In_Lisu +390: In_Vai +391: In_Cyrillic_Extended_B +392: In_Bamum +393: In_Modifier_Tone_Letters +394: In_Latin_Extended_D +395: In_Syloti_Nagri +396: In_Common_Indic_Number_Forms +397: In_Phags_pa +398: In_Saurashtra +399: In_Devanagari_Extended +400: In_Kayah_Li +401: In_Rejang +402: In_Hangul_Jamo_Extended_A +403: In_Javanese +404: In_Myanmar_Extended_B +405: In_Cham +406: In_Myanmar_Extended_A +407: In_Tai_Viet +408: In_Meetei_Mayek_Extensions +409: In_Ethiopic_Extended_A +410: In_Latin_Extended_E +411: In_Cherokee_Supplement +412: In_Meetei_Mayek +413: In_Hangul_Syllables +414: In_Hangul_Jamo_Extended_B +415: In_High_Surrogates +416: In_High_Private_Use_Surrogates +417: In_Low_Surrogates +418: In_Private_Use_Area +419: In_CJK_Compatibility_Ideographs +420: In_Alphabetic_Presentation_Forms +421: In_Arabic_Presentation_Forms_A +422: In_Variation_Selectors +423: In_Vertical_Forms +424: In_Combining_Half_Marks +425: In_CJK_Compatibility_Forms +426: In_Small_Form_Variants +427: In_Arabic_Presentation_Forms_B +428: In_Halfwidth_and_Fullwidth_Forms +429: In_Specials +430: In_Linear_B_Syllabary +431: In_Linear_B_Ideograms +432: In_Aegean_Numbers +433: In_Ancient_Greek_Numbers +434: In_Ancient_Symbols +435: In_Phaistos_Disc +436: In_Lycian +437: In_Carian +438: In_Coptic_Epact_Numbers +439: In_Old_Italic +440: In_Gothic +441: In_Old_Permic +442: In_Ugaritic +443: In_Old_Persian +444: In_Deseret +445: In_Shavian +446: In_Osmanya +447: In_Osage +448: In_Elbasan +449: In_Caucasian_Albanian +450: In_Linear_A +451: In_Cypriot_Syllabary +452: In_Imperial_Aramaic +453: In_Palmyrene +454: In_Nabataean +455: 
In_Hatran +456: In_Phoenician +457: In_Lydian +458: In_Meroitic_Hieroglyphs +459: In_Meroitic_Cursive +460: In_Kharoshthi +461: In_Old_South_Arabian +462: In_Old_North_Arabian +463: In_Manichaean +464: In_Avestan +465: In_Inscriptional_Parthian +466: In_Inscriptional_Pahlavi +467: In_Psalter_Pahlavi +468: In_Old_Turkic +469: In_Old_Hungarian +470: In_Hanifi_Rohingya +471: In_Rumi_Numeral_Symbols +472: In_Old_Sogdian +473: In_Sogdian +474: In_Elymaic +475: In_Brahmi +476: In_Kaithi +477: In_Sora_Sompeng +478: In_Chakma +479: In_Mahajani +480: In_Sharada +481: In_Sinhala_Archaic_Numbers +482: In_Khojki +483: In_Multani +484: In_Khudawadi +485: In_Grantha +486: In_Newa +487: In_Tirhuta +488: In_Siddham +489: In_Modi +490: In_Mongolian_Supplement +491: In_Takri +492: In_Ahom +493: In_Dogra +494: In_Warang_Citi +495: In_Nandinagari +496: In_Zanabazar_Square +497: In_Soyombo +498: In_Pau_Cin_Hau +499: In_Bhaiksuki +500: In_Marchen +501: In_Masaram_Gondi +502: In_Gunjala_Gondi +503: In_Makasar +504: In_Tamil_Supplement +505: In_Cuneiform +506: In_Cuneiform_Numbers_and_Punctuation +507: In_Early_Dynastic_Cuneiform +508: In_Egyptian_Hieroglyphs +509: In_Egyptian_Hieroglyph_Format_Controls +510: In_Anatolian_Hieroglyphs +511: In_Bamum_Supplement +512: In_Mro +513: In_Bassa_Vah +514: In_Pahawh_Hmong +515: In_Medefaidrin +516: In_Miao +517: In_Ideographic_Symbols_and_Punctuation +518: In_Tangut +519: In_Tangut_Components +520: In_Kana_Supplement +521: In_Kana_Extended_A +522: In_Small_Kana_Extension +523: In_Nushu +524: In_Duployan +525: In_Shorthand_Format_Controls +526: In_Byzantine_Musical_Symbols +527: In_Musical_Symbols +528: In_Ancient_Greek_Musical_Notation +529: In_Mayan_Numerals +530: In_Tai_Xuan_Jing_Symbols +531: In_Counting_Rod_Numerals +532: In_Mathematical_Alphanumeric_Symbols +533: In_Sutton_SignWriting +534: In_Glagolitic_Supplement +535: In_Nyiakeng_Puachue_Hmong +536: In_Wancho +537: In_Mende_Kikakui +538: In_Adlam +539: In_Indic_Siyaq_Numbers +540: 
In_Ottoman_Siyaq_Numbers +541: In_Arabic_Mathematical_Alphabetic_Symbols +542: In_Mahjong_Tiles +543: In_Domino_Tiles +544: In_Playing_Cards +545: In_Enclosed_Alphanumeric_Supplement +546: In_Enclosed_Ideographic_Supplement +547: In_Miscellaneous_Symbols_and_Pictographs +548: In_Emoticons +549: In_Ornamental_Dingbats +550: In_Transport_and_Map_Symbols +551: In_Alchemical_Symbols +552: In_Geometric_Shapes_Extended +553: In_Supplemental_Arrows_C +554: In_Supplemental_Symbols_and_Pictographs +555: In_Chess_Symbols +556: In_Symbols_and_Pictographs_Extended_A +557: In_CJK_Unified_Ideographs_Extension_B +558: In_CJK_Unified_Ideographs_Extension_C +559: In_CJK_Unified_Ideographs_Extension_D +560: In_CJK_Unified_Ideographs_Extension_E +561: In_CJK_Unified_Ideographs_Extension_F +562: In_CJK_Compatibility_Ideographs_Supplement +563: In_Tags +564: In_Variation_Selectors_Supplement +565: In_Supplementary_Private_Use_Area_A +566: In_Supplementary_Private_Use_Area_B +567: In_No_Block -- cgit v1.2.3