summaryrefslogtreecommitdiff
path: root/src/make_unicode_property_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/make_unicode_property_data.py')
-rwxr-xr-xsrc/make_unicode_property_data.py545
1 files changed, 545 insertions, 0 deletions
diff --git a/src/make_unicode_property_data.py b/src/make_unicode_property_data.py
new file mode 100755
index 0000000..25ed092
--- /dev/null
+++ b/src/make_unicode_property_data.py
@@ -0,0 +1,545 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import sys
+import re
+
# Names of the POSIX bracket classes.  The main script emits these before any
# other property, so their order fixes the first entries of the generated
# CodeRanges table.
POSIX_LIST = [
    'NEWLINE', 'Alpha', 'Blank', 'Cntrl', 'Digit', 'Graph', 'Lower',
    'Print', 'Punct', 'Space', 'Upper', 'XDigit', 'Word', 'Alnum', 'ASCII'
]

# Upper bound used when inverting range lists.
MAX_CODE_POINT = 0x10ffff

# All patterns are raw strings so "\s", "\w" and "\d" reach the regex engine
# untouched instead of relying on Python passing unknown escapes through.
UD_FIRST_REG = re.compile(r"<.+,\s*First>")
UD_LAST_REG = re.compile(r"<.+,\s*Last>")
PR_TOTAL_REG = re.compile(r"#\s*Total\s+code\s+points:")
# The range separator is a literal "..", so both dots are escaped.
PR_LINE_REG = re.compile(r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
PA_LINE_REG = re.compile(r"(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
# Each version component may have several digits (e.g. "10.0.0").
VERSION_REG = re.compile(r"#\s*.*-(\d+\.\d+\.\d+)\.txt")

# Version string captured by check_version_info(); None until one is found.
VERSION_INFO = None
# Property name -> list of (start, end) code point ranges.
DIC = {}
# Property name -> class label recorded by parse_properties().
KDIC = {}
# Normalized property name -> index, filled by print_prop_and_index().
PropIndex = {}
# Length of the longest name seen by set_max_prop_name().
PROPERTY_NAME_MAX_LEN = 0
+
def normalize_prop_name(name):
    """Return *name* lower-cased with every space and underscore removed."""
    return re.sub(r'[ _]', '', name).lower()
+
def fix_block_name(name):
    """Return *name* prefixed with 'In_', each run of '-'/' ' replaced by '_'."""
    return 'In_' + re.sub(r'[- ]+', '_', name)
+
def check_version_info(s):
    """Store group 1 of VERSION_REG in VERSION_INFO when *s* matches it.

    A non-matching *s* leaves VERSION_INFO untouched.
    """
    global VERSION_INFO
    found = VERSION_REG.match(s)
    if found is None:
        return
    VERSION_INFO = found.group(1)
+
+
def print_ranges(ranges):
    """Print each (start, end) pair as six-digit hex, then the pair count.

    Single-argument, parenthesised print calls produce the same output under
    Python 2 and Python 3.
    """
    for (start, end) in ranges:
        print("0x%06x, 0x%06x" % (start, end))

    print(len(ranges))
+
def print_prop_and_index(prop, i):
    """Print one "name, index" line and record the mapping in PropIndex.

    The name (with a trailing comma) is left-justified in a 35-character
    column and the index right-justified in 3.  The parenthesised print call
    behaves the same under Python 2 and Python 3.
    """
    print("%-35s %3d" % (prop + ',', i))
    PropIndex[prop] = i
+
# Property name -> range list for every table already emitted in full, so a
# later property with identical ranges can be emitted as a #define alias.
print_cache = {}


def print_property(prop, data, desc):
    """Print the C definition of the code range table CR_<prop>.

    prop -- property name used as the CR_ suffix.
    data -- list of (start, end) code point pairs.
    desc -- text placed in the header comment.

    If an earlier property had equal *data*, only
    "#define CR_<prop> CR_<earlier>" is printed.  Otherwise the table is
    printed in full (pair count first) and remembered in print_cache.
    Single-argument print calls keep the output identical under Python 2
    and Python 3.
    """
    print('')
    print("/* PROPERTY: '%s': %s */" % (prop, desc))

    prev_prop = dic_find_by_value(print_cache, data)
    if prev_prop is not None:
        print("#define CR_%s CR_%s" % (prop, prev_prop))
        return

    print_cache[prop] = data
    print("static const OnigCodePoint")
    print("CR_%s[] = { %d," % (prop, len(data)))
    for (start, end) in data:
        print("0x%04x, 0x%04x," % (start, end))

    print("}; /* END of CR_%s */" % prop)
+
+
def dic_find_by_value(dic, v):
    """Return the first key of *dic* whose value equals *v*, else None."""
    return next((key for key, val in dic.items() if val == v), None)
+
+
def normalize_ranges(in_ranges, sort=False):
    """Merge overlapping or adjacent (start, end) ranges into a new list.

    in_ranges -- iterable of (start, end) pairs; it is not modified.
    sort      -- when true, the pairs are sorted first.  Without sorting the
                 input is processed in the order given, so it should already
                 be in ascending order.

    A range is folded into the previous output range whenever that range's
    end reaches at least start - 1 (overlapping or directly adjacent).
    """
    ranges = sorted(in_ranges) if sort else in_ranges

    merged = []
    for (start, end) in ranges:
        # Test the list instead of comparing against an initial None, which
        # raises TypeError on Python 3.
        if merged and merged[-1][1] >= start - 1:
            (pstart, pend) = merged.pop()
            start = pstart
            end = max(pend, end)

        merged.append((start, end))

    return merged
+
def inverse_ranges(in_ranges, max_code_point=None):
    """Return the ranges in 0..max_code_point not covered by *in_ranges*.

    in_ranges      -- (start, end) pairs in ascending, non-overlapping order.
    max_code_point -- inclusive upper bound; defaults to MAX_CODE_POINT.
    """
    if max_code_point is None:
        max_code_point = MAX_CODE_POINT

    gaps = []
    prev = 0x000000
    for (start, end) in in_ranges:
        if prev < start:
            gaps.append((prev, start - 1))

        prev = end + 1

    # "<=" so a trailing gap made of only the last code point is kept;
    # a strict "<" would silently drop it.
    if prev <= max_code_point:
        gaps.append((prev, max_code_point))

    return gaps
+
def add_ranges(r1, r2):
    """Return the sorted, merged union of the range lists *r1* and *r2*."""
    return normalize_ranges(r1 + r2, True)
+
def sub_one_range(one_range, rs):
    """Return the parts of *one_range* left after removing the ranges in *rs*.

    one_range -- a single (start, end) pair.
    rs        -- (start, end) pairs to remove, visited in the order given;
                 the running start only moves forward, so *rs* should be in
                 ascending order.
    """
    remaining = []
    (s1, e1) = one_range
    for (s2, e2) in rs:
        if s1 <= s2 <= e1:
            # The removed range starts inside what is left: keep the piece
            # in front of it, then carry on after it.
            if s2 > s1:
                remaining.append((s1, s2 - 1))
            if e2 >= e1:
                return remaining

            s1 = e2 + 1
        elif s2 < s1 <= e2:
            # The removed range covers the current start.
            if e2 >= e1:
                return remaining

            s1 = e2 + 1

    remaining.append((s1, e1))
    return remaining
+
def sub_ranges(r1, r2):
    """Return the ranges of *r1* with everything covered by *r2* removed.

    Each range of *r1* is reduced on its own by sub_one_range() and the
    pieces are concatenated in order.
    """
    return [piece
            for one_range in r1
            for piece in sub_one_range(one_range, r2)]
+
def add_ranges_in_dic(dic):
    """Return the sorted, merged union of every range list stored in *dic*."""
    combined = []
    # extend() grows one list in place; the keys are not needed.
    for ranges in dic.values():
        combined.extend(ranges)

    return normalize_ranges(combined, True)
+
def normalize_ranges_in_dic(dic, sort=False):
    """Replace every range list in *dic* by its normalize_ranges() result."""
    for key in list(dic):
        dic[key] = normalize_ranges(dic[key], sort)
+
def merge_dic(to_dic, from_dic):
    """Copy every entry of *from_dic* into *to_dic* (modified in place).

    Keys present in both are reported on stderr; the value from *from_dic*
    then replaces the existing one.
    """
    common = set(to_dic) & set(from_dic)
    if common:
        # write() instead of "print >>" so this runs on Python 2 and 3.
        sys.stderr.write("merge_dic: collision: %s\n" % sorted(common))

    to_dic.update(from_dic)
+
def merge_props(to_props, from_props):
    """Append every item of *from_props* to the list *to_props* in place.

    Names present in both are reported on stderr but still appended.
    """
    common = set(to_props) & set(from_props)
    if common:
        # write() instead of "print >>" so this runs on Python 2 and 3.
        sys.stderr.write("merge_props: collision: %s\n" % sorted(common))

    to_props.extend(from_props)
+
def add_range_into_dic(dic, name, start, end):
    """Append (start, end) to the list at dic[name], creating it if absent."""
    dic.setdefault(name, []).append((start, end))
+
def list_sub(a, b):
    """Return a list of the distinct items of *a* that are not in *b*.

    The result comes from a set, so its order is unspecified.
    """
    return list(set(a).difference(b))
+
+
def parse_unicode_data_file(f):
    """Parse the lines of an open UnicodeData.txt-style file.

    Blank lines and lines starting with '#' are skipped.  Every other line
    is split on ';': field 0 is the hex code point, field 1 the description
    and field 2 the property value.  A description matching UD_FIRST_REG
    opens a range that the next UD_LAST_REG line closes.

    Returns (dic, assigned).  dic maps each property value, plus the first
    letter of every two-character value, to its normalized range list.
    assigned holds every (start, end) range in file order.
    """
    table = {}
    assigned = []
    for raw in f:
        text = raw.strip()
        if not text or text[0] == '#':
            continue

        fields = text.split(';')
        code = int(fields[0], 16)
        desc = fields[1]
        prop = fields[2]

        if UD_FIRST_REG.match(desc) is not None:
            # Opening half of a First/Last pair: nothing is recorded until
            # the closing line supplies the end.
            start = code
            end = None
        elif UD_LAST_REG.match(desc) is not None:
            end = code
        else:
            start = end = code

        if end is None:
            continue

        assigned.append((start, end))
        add_range_into_dic(table, prop, start, end)
        if len(prop) == 2:
            # Also file the range under the one-letter prefix.
            add_range_into_dic(table, prop[0:1], start, end)

    normalize_ranges_in_dic(table)
    return table, assigned
+
def parse_properties(path, klass):
    """Parse a "code[..code] ; Property" data file.

    path  -- file to read.
    klass -- label stored in KDIC for every property completed in this file.

    Lines matching PR_LINE_REG add a range to the property they name.  A
    line matching PR_TOTAL_REG closes the most recently named property: it
    is appended to the returned name list and KDIC is updated.  While
    VERSION_INFO is still None, '#' lines are passed to
    check_version_info().

    Returns (dic, props): normalized ranges per property, and property
    names in completion order.
    """
    ranges_by_prop = {}
    prop = None
    props = []
    with open(path, 'r') as f:
        for raw in f:
            text = raw.strip()
            if not text:
                continue

            # No "continue" here: '#' lines must still reach the
            # PR_TOTAL_REG test below.
            if text[0] == '#' and VERSION_INFO is None:
                check_version_info(text)

            hit = PR_LINE_REG.match(text)
            if hit:
                prop = hit.group(3)
                first = int(hit.group(1), 16)
                # A line without "..end" describes a single code point.
                last = int(hit.group(2), 16) if hit.group(2) else first
                add_range_into_dic(ranges_by_prop, prop, first, last)
            elif PR_TOTAL_REG.match(text) is not None:
                KDIC[prop] = klass
                props.append(prop)

    normalize_ranges_in_dic(ranges_by_prop)
    return (ranges_by_prop, props)
+
def parse_property_aliases(path):
    """Read "first ; second" lines from *path* into a dict first -> second.

    Lines that do not match PA_LINE_REG, and lines whose two names are
    equal, contribute nothing.
    """
    aliases = {}
    with open(path, 'r') as f:
        for raw in f:
            text = raw.strip()
            if not text:
                continue

            hit = PA_LINE_REG.match(text)
            if hit is None:
                continue

            first, second = hit.group(1), hit.group(2)
            if first != second:
                aliases[first] = second

    return aliases
+
def parse_property_value_aliases(path):
    """Read the "sc" and "gc" lines of *path* into an alias -> target dict.

    Each matching line has fields "cat ; x2 ; x3 [; x4]".  For "sc" lines
    the target is x3 and x2 is its alias; for "gc" lines the target is x2
    and x3 is its alias.  An optional x4 is a further alias of the same
    target.  Names equal to their target are not stored.
    """
    aliases = {}
    with open(path, 'r') as f:
        for raw in f:
            text = raw.strip()
            if not text:
                continue

            hit = PVA_LINE_REG.match(text)
            if hit is None:
                continue

            cat, x2, x3, x4 = hit.groups()
            if cat == 'sc':
                alias, target = x2, x3
            else:
                alias, target = x3, x2

            if alias != target:
                aliases[alias] = target
            if x4 and x4 != target:
                aliases[x4] = target

    return aliases
+
def parse_blocks(path):
    """Parse "start..end; Block Name" lines from *path*.

    Block names go through fix_block_name().  After the file is read, an
    extra entry built from 'No_Block' receives every code point that no
    block covers.

    Returns (dic, blocks): ranges per block name, and the names in file
    order with the No_Block entry last.
    """
    ranges_by_block = {}
    names = []
    with open(path, 'r') as f:
        for raw in f:
            text = raw.strip()
            if not text:
                continue

            hit = BL_LINE_REG.match(text)
            if hit is None:
                continue

            first = int(hit.group(1), 16)
            last = int(hit.group(2), 16)
            block = fix_block_name(hit.group(3))
            add_range_into_dic(ranges_by_block, block, first, last)
            names.append(block)

    noblock = fix_block_name('No_Block')
    covered = add_ranges_in_dic(ranges_by_block)
    ranges_by_block[noblock] = inverse_ranges(covered)
    names.append(noblock)
    return ranges_by_block, names
+
def add_primitive_props(assigned):
    """Add the properties derived directly from *assigned* to DIC.

    Sets 'Assigned', 'Any', 'ASCII', 'NEWLINE', 'Cn' (the inverse of
    'Assigned') and 'LC' (Ll + Lt + Lu), and folds 'Cn' into the existing
    'C' entry.  DIC must already hold 'C', 'Ll', 'Lt' and 'Lu'.
    """
    DIC['Assigned'] = normalize_ranges(assigned)
    DIC['Any'] = [(0x000000, 0x10ffff)]
    DIC['ASCII'] = [(0x000000, 0x00007f)]
    DIC['NEWLINE'] = [(0x00000a, 0x00000a)]

    unassigned = inverse_ranges(DIC['Assigned'])
    DIC['Cn'] = unassigned
    DIC['C'].extend(unassigned)
    DIC['C'] = normalize_ranges(DIC['C'], True)

    cased = []
    for category in ('Ll', 'Lt', 'Lu'):
        cased.extend(DIC[category])
    DIC['LC'] = normalize_ranges(cased, True)
+
def add_posix_props(dic):
    """Add the POSIX bracket classes to *dic*, built from entries it holds.

    Reads 'Alphabetic', 'Uppercase', 'Lowercase', 'White_Space', 'Any',
    'Nd', 'Zs', 'M', 'Pc', 'P', 'Cc', 'Cs' and 'Cn'.  Writes 'Alpha',
    'Upper', 'Lower', 'Punct', 'Digit', 'XDigit', 'Alnum', 'Space',
    'Blank', 'Cntrl', 'Word', 'Graph' and 'Print'.
    """
    # Alnum = Alphabetic + Nd (Decimal_Number)
    alnum = normalize_ranges(dic['Alphabetic'] + dic['Nd'], True)

    # Blank = TAB + Zs (Space_Separator)
    blank = normalize_ranges([(0x0009, 0x0009)] + dic['Zs'], True)

    # Word = Alphabetic + M (Mark) + Nd + Pc (Connector_Punctuation)
    word = normalize_ranges(
        dic['Alphabetic'] + dic['M'] + dic['Nd'] + dic['Pc'], True)

    # Graph = Any - White_Space - Cc - Cs (Surrogate) - Cn (Unassigned)
    graph = dic['Any']
    for excluded in ('White_Space', 'Cc', 'Cs', 'Cn'):
        graph = sub_ranges(graph, dic[excluded])
    graph = normalize_ranges(graph, True)

    # Print = Graph + Zs
    printable = normalize_ranges(graph + dic['Zs'], True)

    dic['Alpha'] = dic['Alphabetic']
    dic['Upper'] = dic['Uppercase']
    dic['Lower'] = dic['Lowercase']
    dic['Punct'] = dic['P']  # P == Punctuation
    dic['Digit'] = dic['Nd']
    dic['XDigit'] = [(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)]
    dic['Alnum'] = alnum
    dic['Space'] = dic['White_Space']
    dic['Blank'] = blank
    dic['Cntrl'] = dic['Cc']
    dic['Word'] = word
    dic['Graph'] = graph
    dic['Print'] = printable
+
+
def set_max_prop_name(name):
    """Raise PROPERTY_NAME_MAX_LEN to len(name) when *name* is longer."""
    global PROPERTY_NAME_MAX_LEN
    PROPERTY_NAME_MAX_LEN = max(PROPERTY_NAME_MAX_LEN, len(name))
+
# 1-based number of the next line written to the UPF listing.
LIST_COUNTER = 1


def entry_prop_name(name, index):
    """Account for *name* and, when applicable, list it in the UPF file.

    The name always takes part in the PROPERTY_NAME_MAX_LEN computation.
    It is written to UPF, numbered by LIST_COUNTER, only when OUTPUT_LIST
    is set and *index* lies beyond the POSIX_LIST entries.
    """
    global LIST_COUNTER
    set_max_prop_name(name)
    if OUTPUT_LIST and index >= len(POSIX_LIST):
        # write() instead of "print >>" so this runs on Python 2 and 3.
        UPF.write("%3d: %s\n" % (LIST_COUNTER, name))
        LIST_COUNTER += 1
+
+
+### main ###
# Script body: read the Unicode data files from the current directory and
# print a gperf input file on stdout.  Unless "-posix" is given, a plain
# listing of the property names is also written to UNICODE_PROPERTIES.
argv = sys.argv
argc = len(argv)

# "-posix" as first argument restricts the output to the POSIX classes.
POSIX_ONLY = False
if argc >= 2:
    if argv[1] == '-posix':
        POSIX_ONLY = True

OUTPUT_LIST = not(POSIX_ONLY)

with open('UnicodeData.txt', 'r') as f:
    dic, assigned = parse_unicode_data_file(f)
    DIC = dic
    add_primitive_props(assigned)

# Non-POSIX names known so far, sorted.
PROPS = DIC.keys()
PROPS = list_sub(PROPS, POSIX_LIST)
PROPS = sorted(PROPS)

dic, props = parse_properties('DerivedCoreProperties.txt', 'Derived Property')
merge_dic(DIC, dic)
merge_props(PROPS, props)

dic, props = parse_properties('Scripts.txt', 'Script')
merge_dic(DIC, dic)
merge_props(PROPS, props)
# 'Unknown' is every code point that no script in Scripts.txt covers.
DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic))

dic, props = parse_properties('PropList.txt', 'Binary Property')
merge_dic(DIC, dic)
merge_props(PROPS, props)
PROPS.append('Unknown')
KDIC['Unknown'] = 'Script'

ALIASES = parse_property_aliases('PropertyAliases.txt')
a = parse_property_value_aliases('PropertyValueAliases.txt')
merge_dic(ALIASES, a)

dic, BLOCKS = parse_blocks('Blocks.txt')
merge_dic(DIC, dic)

add_posix_props(DIC)

# --- code range tables ---
s = '''%{
/* Generated by make_unicode_property_data.py. */
'''
print(s)
for prop in POSIX_LIST:
    print_property(prop, DIC[prop], "POSIX [[:%s:]]" % prop)

print('')

if not(POSIX_ONLY):
    for prop in PROPS:
        klass = KDIC.get(prop, None)
        if klass is None:
            # Names without a recorded class are labelled by their length.
            n = len(prop)
            if n == 1:
                klass = 'Major Category'
            elif n == 2:
                klass = 'General Category'
            else:
                klass = '-'

        print_property(prop, DIC[prop], klass)

    for block in BLOCKS:
        print_property(block, DIC[block], 'Block')


# --- CodeRanges[]: one entry per property, in index order ---
print('')
print("static const OnigCodePoint*\nconst CodeRanges[] = {")

for prop in POSIX_LIST:
    print(" CR_%s," % prop)

if not(POSIX_ONLY):
    for prop in PROPS:
        print(" CR_%s," % prop)

    for prop in BLOCKS:
        print(" CR_%s," % prop)

# The first member must end with ';' (the original ':' was a typo that
# makes the emitted struct invalid C).
s = '''};
%}
struct PropertyNameCtype {
  char* name;
  int ctype;
};
%%
'''
sys.stdout.write(s)

# --- name -> index entries ---
if OUTPUT_LIST:
    UPF = open("UNICODE_PROPERTIES", "w")
    if VERSION_INFO is not None:
        UPF.write("Unicode Properties (from Unicode Version: %s)\n"
                  % VERSION_INFO)
        UPF.write("\n")

index = -1
for prop in POSIX_LIST:
    index += 1
    entry_prop_name(prop, index)
    prop = normalize_prop_name(prop)
    print_prop_and_index(prop, index)

if not(POSIX_ONLY):
    for prop in PROPS:
        index += 1
        entry_prop_name(prop, index)
        prop = normalize_prop_name(prop)
        print_prop_and_index(prop, index)

    NALIASES = sorted((normalize_prop_name(k), k, v)
                      for (k, v) in ALIASES.items())
    for (nk, k, v) in NALIASES:
        nv = normalize_prop_name(v)
        if PropIndex.get(nk, None) is not None:
            sys.stderr.write("ALIASES: already exists: %s => %s\n" % (k, v))
            continue
        # An alias reuses its target's index.  Use a separate variable:
        # overwriting the running "index" would make the block entries
        # below, and CODE_RANGES_NUM, continue from the wrong position.
        aindex = PropIndex.get(nv, None)
        if aindex is None:
            # The alias target was never emitted; skip it silently.
            continue

        entry_prop_name(k, aindex)
        print_prop_and_index(nk, aindex)

    for name in BLOCKS:
        index += 1
        entry_prop_name(name, index)
        name = normalize_prop_name(name)
        print_prop_and_index(name, index)

print('%%')
print('')
if VERSION_INFO is not None:
    print("#define PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO))
    print('')

print("#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10))
print("#define CODE_RANGES_NUM %d" % (index + 1))

if OUTPUT_LIST:
    UPF.close()

sys.exit(0)