1 files changed, 264 insertions, 0 deletions
diff --git a/src/make_unicode_wb_data.py b/src/make_unicode_wb_data.py
new file mode 100755
index 0000000..624fa7e
--- /dev/null
+++ b/src/make_unicode_wb_data.py
@@ -0,0 +1,264 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# make_unicode_wb_data.py
+# Copyright (c) 2019  K.Kosako
+
+import sys
+import re
+
+MAX_CODE_POINT = 0x10ffff
+
+PR_TOTAL_REG = re.compile("#\s*Total\s+(?:code\s+points|elements):")
+PR_LINE_REG  = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)")
+PA_LINE_REG  = re.compile("(\w+)\s*;\s*(\w+)")
+PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
+BL_LINE_REG  = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
+VERSION_REG  = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt")
+
+VERSION_INFO = None
+DIC  = { }
+PROPS = []
+PropIndex = { }
+
+def check_version_info(s):
+  global VERSION_INFO
+  m = VERSION_REG.match(s)
+  if m is not None:
+    VERSION_INFO = m.group(1)
+
+def print_ranges(ranges):
+  for (start, end) in ranges:
+    print "0x%06x, 0x%06x" % (start, end)
+
+def print_prop_and_index(prop, i):
+  print "%-35s %3d" % (prop + ',', i)
+  PropIndex[prop] = i
+
+def dic_find_by_value(dic, v):
+  for key, val in dic.items():
+    if val == v:
+      return key
+
+  return None
+
+
+def normalize_ranges(in_ranges, sort=False):
+  if sort:
+    ranges = sorted(in_ranges)
+  else:
+    ranges = in_ranges
+
+  r = []
+  prev = None
+  for (start, end) in ranges:
+    if prev >= start - 1:
+      (pstart, pend) = r.pop()
+      end = max(pend, end)
+      start = pstart
+
+    r.append((start, end))
+    prev = end
+
+  return r
+
+def inverse_ranges(in_ranges):
+  r = []
+  prev = 0x000000
+  for (start, end) in in_ranges:
+    if prev < start:
+      r.append((prev, start - 1))
+
+    prev = end + 1
+
+  if prev < MAX_CODE_POINT:
+    r.append((prev, MAX_CODE_POINT))
+
+  return r
+
+def add_ranges(r1, r2):
+  r = r1 + r2
+  return normalize_ranges(r, True)
+
+def sub_one_range(one_range, rs):
+  r = []
+  (s1, e1) = one_range
+  n = len(rs)
+  for i in range(0, n):
+    (s2, e2) = rs[i]
+    if s2 >= s1 and s2 <= e1:
+      if s2 > s1:
+        r.append((s1, s2 - 1))
+      if e2 >= e1:
+        return r
+
+      s1 = e2 + 1
+    elif s2 < s1 and e2 >= s1:
+      if e2 < e1:
+        s1 = e2 + 1
+      else:
+        return r
+
+  r.append((s1, e1))
+  return r
+
+def sub_ranges(r1, r2):
+  r = []
+  for one_range in r1:
+    rs = sub_one_range(one_range, r2)
+    r.extend(rs)
+
+  return r
+
+def add_ranges_in_dic(dic):
+  r = []
+  for k, v in dic.items():
+    r = r + v
+
+  return normalize_ranges(r, True)
+
+def normalize_ranges_in_dic(dic, sort=False):
+  for k, v in dic.items():
+    r = normalize_ranges(v, sort)
+    dic[k] = r
+
+def merge_dic(to_dic, from_dic):
+  to_keys   = to_dic.keys()
+  from_keys = from_dic.keys()
+  common = list(set(to_keys) & set(from_keys))
+  if len(common) != 0:
+    print >> sys.stderr, "merge_dic: collision: %s" % sorted(common)
+
+  to_dic.update(from_dic)
+
+def merge_props(to_props, from_props):
+  common = list(set(to_props) & set(from_props))
+  if len(common) != 0:
+    print >> sys.stderr, "merge_props: collision: %s" % sorted(common)
+
+  to_props.extend(from_props)
+
+def add_range_into_dic(dic, name, start, end):
+  d = dic.get(name, None)
+  if d is None:
+    d = [(start, end)]
+    dic[name] = d
+  else:
+    d.append((start, end))
+
+def list_sub(a, b):
+  x = set(a) - set(b)
+  return list(x)
+
+def parse_properties(path):
+  with open(path, 'r') as f:
+    dic = { }
+    prop = None
+    props = []
+    for line in f:
+      s = line.strip()
+      if len(s) == 0:
+        continue
+
+      if s[0] == '#':
+        if VERSION_INFO is None:
+          check_version_info(s)
+
+      m = PR_LINE_REG.match(s)
+      if m:
+        prop = m.group(3)
+        if m.group(2):
+          start = int(m.group(1), 16)
+          end   = int(m.group(2), 16)
+          add_range_into_dic(dic, prop, start, end)
+        else:
+          start = int(m.group(1), 16)
+          add_range_into_dic(dic, prop, start, start)
+
+      elif PR_TOTAL_REG.match(s) is not None:
+        props.append(prop)
+
+  normalize_ranges_in_dic(dic)
+  return (dic, props)
+
+
+### main ###
+argv = sys.argv
+argc = len(argv)
+
+dic, props = parse_properties('WordBreakProperty.txt')
+merge_dic(DIC, dic)
+merge_props(PROPS, props)
+
+PROPS = sorted(PROPS)
+
+print '/* unicode_wb_data.c: Generated by make_unicode_wb_data.py. */'
+COPYRIGHT = '''
+/*-
+ * Copyright (c) 2019  K.Kosako  <kkosako0 AT gmail DOT com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+'''.strip()
+
+print COPYRIGHT
+print ''
+if VERSION_INFO is not None:
+  print "#define WORD_BREAK_PROPERTY_VERSION  %s" % re.sub(r'[\.-]', '_', VERSION_INFO)
+  print ''
+
+ranges = []
+for prop in PROPS:
+  rs = DIC[prop]
+  for (start, end) in rs:
+    ranges.append((start, end, prop))
+
+ranges = sorted(ranges, key=lambda x: x[0])
+
+prev = -1
+for (start, end, prop) in ranges:
+  if prev >= start:
+    raise ValueError("{2}:{0} - {1} range overlap prev value {3}".format(start, end, prop, prev))
+
+
+print '/*'
+for prop in PROPS:
+  print "%s" % prop
+print '*/'
+print ''
+
+num_ranges = len(ranges)
+print "static int WB_RANGE_NUM = %d;" % num_ranges
+
+print 'static WB_RANGE_TYPE WB_RANGES[] = {'
+for i, (start, end, prop) in enumerate(ranges):
+  if i == num_ranges - 1:
+    comma = ''
+  else:
+    comma = ','
+
+  type_name = 'WB_' + prop
+  print " {0x%06x, 0x%06x, %s }%s" % (start, end, type_name, comma)
+
+print '};'
+
+sys.exit(0)