#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_unicode_egcb_data.py
# Copyright (c) 2017-2020 K.Kosako
#
# Reads 'GraphemeBreakProperty.txt' from the current directory and writes
# the C range table (unicode_egcb_data.c) to standard output.
#
# NOTE: all output goes through single-argument print(...) calls or
# sys.stderr.write(), which behave identically on Python 2 and Python 3.

import sys
import re

MAX_CODE_POINT = 0x10ffff

# Comment line "# Total code points:" / "# Total elements:".  On such a
# line parse_properties() records the most recently seen property name.
PR_TOTAL_REG = re.compile(r"#\s*Total\s+(?:code\s+points|elements):")
# Data line "XXXX ; Name" or "XXXX..YYYY ; Name" (hexadecimal code points).
# The ".." separator is escaped so that only literal dots are accepted.
PR_LINE_REG = re.compile(
    r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
PA_LINE_REG = re.compile(r"(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile(
    r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
# Comment line that ends in "-<major>.<minor>.<patch>.txt".
VERSION_REG = re.compile(r"#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt")

# [major, minor, patch]; stays negative until check_version_info() matches.
VERSION_INFO = [-1, -1, -1]
DIC = {}
PROPS = []
PropIndex = {}

COPYRIGHT = '''
/*-
 * Copyright (c) 2017-2020 K.Kosako
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
'''.strip()


def check_version_info(s):
    """Store the version found in comment line *s* into VERSION_INFO.

    VERSION_INFO is left untouched when *s* does not match VERSION_REG.
    """
    m = VERSION_REG.match(s)
    if m is not None:
        VERSION_INFO[0] = int(m.group(1))
        VERSION_INFO[1] = int(m.group(2))
        VERSION_INFO[2] = int(m.group(3))


def print_ranges(ranges):
    """Print every (start, end) pair as two zero-padded hex literals."""
    for (start, end) in ranges:
        print("0x%06x, 0x%06x" % (start, end))


def print_prop_and_index(prop, i):
    """Print "<prop>, <i>" and remember the index in PropIndex."""
    print("%-35s %3d" % (prop + ',', i))
    PropIndex[prop] = i


def dic_find_by_value(dic, v):
    """Return a key of *dic* whose value equals *v*, or None if none does."""
    for key, val in dic.items():
        if val == v:
            return key
    return None


def normalize_ranges(in_ranges, sort=False):
    """Merge overlapping or adjacent (start, end) ranges.

    The input is processed in the order given, so it should already be
    ordered by start unless *sort* is true, in which case it is sorted
    first.  Returns a new list; *in_ranges* is not modified.
    """
    ranges = sorted(in_ranges) if sort else in_ranges
    r = []
    for (start, end) in ranges:
        # "r" is non-empty exactly when a previous range exists.  Testing
        # that explicitly avoids the None-vs-int comparison the original
        # relied on, which raises TypeError on Python 3.
        if r and r[-1][1] >= start - 1:
            (pstart, pend) = r.pop()
            end = max(pend, end)
            start = pstart
        r.append((start, end))
    return r


def inverse_ranges(in_ranges):
    """Return the gaps of *in_ranges* within [0, MAX_CODE_POINT].

    *in_ranges* is walked in order, so it is expected to be sorted and
    non-overlapping (e.g. the output of normalize_ranges).
    """
    r = []
    prev = 0x000000
    for (start, end) in in_ranges:
        if prev < start:
            r.append((prev, start - 1))
        prev = end + 1
    # "<=" (not "<"): when the input stops at MAX_CODE_POINT - 1 the final
    # single-code-point gap (MAX_CODE_POINT, MAX_CODE_POINT) must be kept.
    if prev <= MAX_CODE_POINT:
        r.append((prev, MAX_CODE_POINT))
    return r


def add_ranges(r1, r2):
    """Return the normalized union of two range lists."""
    return normalize_ranges(r1 + r2, True)


def sub_one_range(one_range, rs):
    """Return the parts of *one_range* not covered by the ranges in *rs*.

    *rs* is scanned in order; each entry that touches the remaining part
    of *one_range* trims it from the left.
    """
    r = []
    (s1, e1) = one_range
    for (s2, e2) in rs:
        if s1 <= s2 <= e1:
            # rs entry starts inside the remainder: keep what lies before.
            if s2 > s1:
                r.append((s1, s2 - 1))
            if e2 >= e1:
                return r
            s1 = e2 + 1
        elif s2 < s1 and e2 >= s1:
            # rs entry starts before the remainder and reaches into it.
            if e2 < e1:
                s1 = e2 + 1
            else:
                return r
    r.append((s1, e1))
    return r


def sub_ranges(r1, r2):
    """Return *r1* with everything covered by *r2* removed."""
    r = []
    for one_range in r1:
        r.extend(sub_one_range(one_range, r2))
    return r


def add_ranges_in_dic(dic):
    """Return the normalized union of every range list stored in *dic*."""
    r = []
    for v in dic.values():
        r.extend(v)
    return normalize_ranges(r, True)


def normalize_ranges_in_dic(dic, sort=False):
    """Replace every range list in *dic* by its normalized form, in place."""
    for k, v in dic.items():
        # Only existing keys are reassigned, so the dict size is unchanged
        # while iterating.
        dic[k] = normalize_ranges(v, sort)


def merge_dic(to_dic, from_dic):
    """Update *to_dic* with *from_dic*, reporting shared keys on stderr."""
    common = list(set(to_dic.keys()) & set(from_dic.keys()))
    if common:
        sys.stderr.write("merge_dic: collision: %s\n" % sorted(common))
    to_dic.update(from_dic)


def merge_props(to_props, from_props):
    """Extend *to_props* with *from_props*, reporting duplicates on stderr."""
    common = list(set(to_props) & set(from_props))
    if common:
        sys.stderr.write("merge_props: collision: %s\n" % sorted(common))
    to_props.extend(from_props)


def add_range_into_dic(dic, name, start, end):
    """Append (start, end) to the list under *name*, creating it if needed."""
    dic.setdefault(name, []).append((start, end))


def list_sub(a, b):
    """Return the elements of *a* that are not in *b* (order unspecified)."""
    return list(set(a) - set(b))


def parse_properties(path):
    """Parse a property data file.

    Returns (dic, props): *dic* maps each property name to its normalized
    list of (start, end) code point ranges, and *props* lists the property
    name that was current at each "# Total ..." comment line.  As a side
    effect VERSION_INFO is filled in from the first matching comment line.
    """
    with open(path, 'r') as f:
        dic = {}
        prop = None
        props = []
        for line in f:
            s = line.strip()
            if not s:
                continue

            if s[0] == '#' and VERSION_INFO[0] < 0:
                check_version_info(s)

            m = PR_LINE_REG.match(s)
            if m:
                prop = m.group(3)
                start = int(m.group(1), 16)
                # A missing second group means a single code point.
                end = int(m.group(2), 16) if m.group(2) else start
                add_range_into_dic(dic, prop, start, end)
            elif PR_TOTAL_REG.match(s) is not None:
                props.append(prop)

    normalize_ranges_in_dic(dic)
    return (dic, props)


def collect_ranges(dic, props):
    """Return (start, end, prop) for every range of *props*, sorted by start."""
    ranges = []
    for prop in props:
        for (start, end) in dic[prop]:
            ranges.append((start, end, prop))
    return sorted(ranges, key=lambda x: x[0])


def find_overlap(ranges):
    """Return the first overlap in start-sorted (start, end, prop) triples.

    The result is (prev_end, start, end, prop) for the first triple whose
    start is not beyond the end of its predecessor, or None when the
    triples are pairwise disjoint.
    """
    prev = -1
    for (start, end, prop) in ranges:
        if prev >= start:
            return (prev, start, end, prop)
        # The original loop never advanced "prev", so the check could
        # never fire; track the end of the range just accepted.
        prev = end
    return None


def main():
    """Generate unicode_egcb_data.c on standard output.

    Raises RuntimeError when the data file carries no version line and
    ValueError when ranges of different properties overlap.
    """
    dic, props = parse_properties('GraphemeBreakProperty.txt')
    merge_dic(DIC, dic)
    merge_props(PROPS, props)
    PROPS.sort()

    print('/* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */')
    print(COPYRIGHT)
    print('')

    if VERSION_INFO[0] < 0:
        raise RuntimeError("Version is not found")

    print("#define GRAPHEME_BREAK_PROPERTY_VERSION %02d%02d%02d"
          % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]))
    print('')

    ranges = collect_ranges(DIC, PROPS)
    overlap = find_overlap(ranges)
    if overlap is not None:
        (prev, start, end, prop) = overlap
        raise ValueError(
            "{2}:{0} - {1} range overlap prev value {3}".format(
                start, end, prop, prev))

    print('/*')
    for prop in PROPS:
        print("%s" % prop)
    print('*/')
    print('')

    num_ranges = len(ranges)
    print("static int EGCB_RANGE_NUM = %d;" % num_ranges)
    print('static EGCB_RANGE_TYPE EGCB_RANGES[] = {')
    for i, (start, end, prop) in enumerate(ranges):
        # No trailing comma after the last initializer.
        comma = '' if i == num_ranges - 1 else ','
        type_name = 'EGCB_' + prop
        print(" {0x%06x, 0x%06x, %s }%s" % (start, end, type_name, comma))
    print('};')
    return 0


if __name__ == '__main__':
    sys.exit(main())