diff options
Diffstat (limited to 'src/make_unicode_wb_data.py')
-rwxr-xr-x | src/make_unicode_wb_data.py | 264 |
1 files changed, 264 insertions, 0 deletions
diff --git a/src/make_unicode_wb_data.py b/src/make_unicode_wb_data.py new file mode 100755 index 0000000..624fa7e --- /dev/null +++ b/src/make_unicode_wb_data.py @@ -0,0 +1,264 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# make_unicode_wb_data.py +# Copyright (c) 2019 K.Kosako + +import sys +import re + +MAX_CODE_POINT = 0x10ffff + +PR_TOTAL_REG = re.compile("#\s*Total\s+(?:code\s+points|elements):") +PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") +PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)") +PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") +BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") +VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt") + +VERSION_INFO = None +DIC = { } +PROPS = [] +PropIndex = { } + +def check_version_info(s): + global VERSION_INFO + m = VERSION_REG.match(s) + if m is not None: + VERSION_INFO = m.group(1) + +def print_ranges(ranges): + for (start, end) in ranges: + print "0x%06x, 0x%06x" % (start, end) + +def print_prop_and_index(prop, i): + print "%-35s %3d" % (prop + ',', i) + PropIndex[prop] = i + +def dic_find_by_value(dic, v): + for key, val in dic.items(): + if val == v: + return key + + return None + + +def normalize_ranges(in_ranges, sort=False): + if sort: + ranges = sorted(in_ranges) + else: + ranges = in_ranges + + r = [] + prev = None + for (start, end) in ranges: + if prev >= start - 1: + (pstart, pend) = r.pop() + end = max(pend, end) + start = pstart + + r.append((start, end)) + prev = end + + return r + +def inverse_ranges(in_ranges): + r = [] + prev = 0x000000 + for (start, end) in in_ranges: + if prev < start: + r.append((prev, start - 1)) + + prev = end + 1 + + if prev < MAX_CODE_POINT: + r.append((prev, MAX_CODE_POINT)) + + return r + +def add_ranges(r1, r2): + r = r1 + r2 + return normalize_ranges(r, True) + +def sub_one_range(one_range, rs): + r = [] + (s1, e1) = one_range + n = len(rs) + for i in range(0, n): + (s2, e2) = rs[i] + if s2 >= s1 and s2 <= e1: + if s2 > s1: + r.append((s1, s2 - 1)) + if e2 >= e1: + return r + + s1 = e2 + 1 + elif s2 < s1 and e2 >= s1: + if e2 < e1: + s1 = e2 + 1 + else: + return r + + r.append((s1, e1)) + return r + +def sub_ranges(r1, r2): + r = [] + for one_range in r1: + rs = sub_one_range(one_range, r2) + r.extend(rs) + + return r + +def add_ranges_in_dic(dic): + r = [] + for k, v in dic.items(): + r = r + v + + return normalize_ranges(r, True) + +def normalize_ranges_in_dic(dic, sort=False): + for k, v in dic.items(): + r = normalize_ranges(v, sort) + dic[k] = r + +def merge_dic(to_dic, from_dic): + to_keys = to_dic.keys() + from_keys = from_dic.keys() + common = list(set(to_keys) & set(from_keys)) + if len(common) != 0: + print >> sys.stderr, "merge_dic: collision: %s" % sorted(common) + + to_dic.update(from_dic) + +def merge_props(to_props, from_props): + common = list(set(to_props) & set(from_props)) + if len(common) != 0: + print >> sys.stderr, "merge_props: collision: %s" % sorted(common) + + to_props.extend(from_props) + +def add_range_into_dic(dic, name, start, end): + d = dic.get(name, None) + if d is None: + d = [(start, end)] + dic[name] = d + else: + d.append((start, end)) + +def list_sub(a, b): + x = set(a) - set(b) + return list(x) + +def parse_properties(path): + with open(path, 'r') as f: + dic = { } + prop = None + props = [] + for line in f: + s = line.strip() + if len(s) == 0: + continue + + if s[0] == '#': + if VERSION_INFO is None: + check_version_info(s) + + m = PR_LINE_REG.match(s) + if m: + prop = m.group(3) + if m.group(2): + start = int(m.group(1), 16) + end = int(m.group(2), 16) + add_range_into_dic(dic, prop, start, end) + else: + start = int(m.group(1), 16) + add_range_into_dic(dic, prop, start, start) + + elif PR_TOTAL_REG.match(s) is not None: + props.append(prop) + + normalize_ranges_in_dic(dic) + return (dic, props) + + +### main ### +argv = sys.argv +argc = len(argv) + +dic, props = parse_properties('WordBreakProperty.txt') +merge_dic(DIC, dic) +merge_props(PROPS, props) + +PROPS = sorted(PROPS) + +print '/* unicode_wb_data.c: Generated by make_unicode_wb_data.py. */' +COPYRIGHT = ''' +/*- + * Copyright (c) 2019 K.Kosako <kkosako0 AT gmail DOT com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +'''.strip() + +print COPYRIGHT +print '' +if VERSION_INFO is not None: + print "#define WORD_BREAK_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) + print '' + +ranges = [] +for prop in PROPS: + rs = DIC[prop] + for (start, end) in rs: + ranges.append((start, end, prop)) + +ranges = sorted(ranges, key=lambda x: x[0]) + +prev = -1 +for (start, end, prop) in ranges: + if prev >= start: + raise ValueError("{2}:{0} - {1} range overlap prev value {3}".format(start, end, prop, prev)) + + +print '/*' +for prop in PROPS: + print "%s" % prop +print '*/' +print '' + +num_ranges = len(ranges) +print "static int WB_RANGE_NUM = %d;" % num_ranges + +print 'static WB_RANGE_TYPE WB_RANGES[] = {' +for i, (start, end, prop) in enumerate(ranges): + if i == num_ranges - 1: + comma = '' + else: + comma = ',' + + type_name = 'WB_' + prop + print " {0x%06x, 0x%06x, %s }%s" % (start, end, type_name, comma) + +print '};' + +sys.exit(0) |