summary refs log tree commit diff
path: root/src/make_unicode_property_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/make_unicode_property_data.py')
-rwxr-xr-x src/make_unicode_property_data.py 145
1 files changed, 76 insertions, 69 deletions
diff --git a/src/make_unicode_property_data.py b/src/make_unicode_property_data.py
index 78ccb29..b0a8263 100755
--- a/src/make_unicode_property_data.py
+++ b/src/make_unicode_property_data.py
@@ -1,14 +1,15 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# make_unicode_property_data.py
-# Copyright (c) 2016-2021 K.Kosako
+# Copyright (c) 2016-2023 K.Kosako
import sys
import re
POSIX_LIST = [
- 'NEWLINE', 'Alpha', 'Blank', 'Cntrl', 'Digit', 'Graph', 'Lower',
- 'Print', 'Punct', 'Space', 'Upper', 'XDigit', 'Word', 'Alnum', 'ASCII'
+ 'NEWLINE', 'Alpha', 'Blank', 'Cntrl', 'Digit', 'Graph', 'Lower',
+ 'Print', 'PosixPunct', 'Space', 'Upper', 'XDigit', 'Word', 'Alnum',
+ 'ASCII'
]
MAX_CODE_POINT = 0x10ffff
@@ -45,31 +46,31 @@ def fix_block_name(name):
def print_ranges(ranges):
for (start, end) in ranges:
- print "0x%06x, 0x%06x" % (start, end)
+ print("0x%06x, 0x%06x" % (start, end))
- print len(ranges)
+ print(len(ranges))
def print_prop_and_index(prop, i):
- print "%-35s %3d" % (prop + ',', i)
+ print("%-35s %3d" % (prop + ',', i))
PropIndex[prop] = i
PRINT_CACHE = { }
def print_property(prop, data, desc):
- print ''
- print "/* PROPERTY: '%s': %s */" % (prop, desc)
+ print('')
+ print("/* PROPERTY: '%s': %s */" % (prop, desc))
prev_prop = dic_find_by_value(PRINT_CACHE, data)
if prev_prop is not None:
- print "#define CR_%s CR_%s" % (prop, prev_prop)
+ print("#define CR_%s CR_%s" % (prop, prev_prop))
else:
PRINT_CACHE[prop] = data
- print "static const OnigCodePoint"
- print "CR_%s[] = { %d," % (prop, len(data))
+ print("static const OnigCodePoint")
+ print("CR_%s[] = { %d," % (prop, len(data)))
for (start, end) in data:
- print "0x%04x, 0x%04x," % (start, end)
+ print("0x%04x, 0x%04x," % (start, end))
- print "}; /* END of CR_%s */" % prop
+ print("}; /* END of CR_%s */" % prop)
def dic_find_by_value(dic, v):
@@ -99,7 +100,7 @@ def normalize_ranges(in_ranges, sort=False):
r = []
prev = None
for (start, end) in ranges:
- if prev >= start - 1:
+ if prev is not None and prev >= start - 1:
(pstart, pend) = r.pop()
end = max(pend, end)
start = pstart
@@ -174,16 +175,19 @@ def merge_dic(to_dic, from_dic):
from_keys = from_dic.keys()
common = list(set(to_keys) & set(from_keys))
if len(common) != 0:
- print >> sys.stderr, "merge_dic: collision: %s" % sorted(common)
+ print("merge_dic: collision: %s" % sorted(common), file=sys.stderr)
to_dic.update(from_dic)
-def merge_props(to_props, from_props):
- common = list(set(to_props) & set(from_props))
+def merge_props(to_dic, from_dic):
+ to_keys = to_dic.keys()
+ from_keys = from_dic.keys()
+ common = list(set(to_keys) & set(from_keys))
if len(common) != 0:
- print >> sys.stderr, "merge_props: collision: %s" % sorted(common)
+ print("merge_props: collision: %s" % sorted(common), file=sys.stderr)
- to_props.extend(from_props)
+ for k in from_keys:
+ to_dic[k] = True
def add_range_into_dic(dic, name, start, end):
d = dic.get(name, None)
@@ -234,7 +238,6 @@ def parse_properties(path, klass, prop_prefix = None, version_reg = None):
with open(path, 'r') as f:
dic = { }
prop = None
- props = []
for line in f:
s = line.strip()
if len(s) == 0:
@@ -261,10 +264,9 @@ def parse_properties(path, klass, prop_prefix = None, version_reg = None):
elif PR_TOTAL_REG.match(s) is not None:
KDIC[prop] = klass
- props.append(prop)
normalize_ranges_in_dic(dic)
- return (dic, props, version_match)
+ return (dic, version_match)
def parse_property_aliases(path):
a = { }
@@ -384,7 +386,7 @@ def add_posix_props(dic):
dic['Alpha'] = dic['Alphabetic']
dic['Upper'] = dic['Uppercase']
dic['Lower'] = dic['Lowercase']
- dic['Punct'] = dic['P'] # P == Punctuation
+ dic['PosixPunct'] = add_ranges(dic['P'], dic['S']) # P == Punctuation
dic['Digit'] = dic['Nd']
dic['XDigit'] = [(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)]
dic['Alnum'] = alnum
@@ -405,7 +407,7 @@ def set_max_prop_name(name):
def entry_prop_name(name, index):
set_max_prop_name(name)
if OUTPUT_LIST_MODE and index >= len(POSIX_LIST):
- print >> UPF, "%s" % (name)
+ print("%s" % (name), file=UPF)
def entry_and_print_prop_and_index(name, index):
entry_prop_name(name, index)
@@ -413,10 +415,10 @@ def entry_and_print_prop_and_index(name, index):
print_prop_and_index(nname, index)
def parse_and_merge_properties(path, klass, prop_prefix = None, version_reg = None):
- dic, props, ver_m = parse_properties(path, klass, prop_prefix, version_reg)
+ dic, ver_m = parse_properties(path, klass, prop_prefix, version_reg)
merge_dic(DIC, dic)
- merge_props(PROPS, props)
- return dic, props, ver_m
+ merge_props(PROPS, dic)
+ return dic, ver_m
### main ###
@@ -425,7 +427,7 @@ argc = len(argv)
COPYRIGHT = '''
/*-
- * Copyright (c) 2016-2021 K.Kosako
+ * Copyright (c) 2016-2023 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -461,7 +463,7 @@ for i in range(1, argc):
elif arg == '-gc':
INCLUDE_GRAPHEME_CLUSTER_DATA = True
else:
- print >> sys.stderr, "Invalid argument: %s" % arg
+ print("Invalid argument: %s" % arg, file=sys.stderr)
OUTPUT_LIST_MODE = not(POSIX_ONLY)
@@ -471,26 +473,26 @@ with open('UnicodeData.txt', 'r') as f:
DIC = dic
add_primitive_props(assigned)
-PROPS = DIC.keys()
-PROPS = list_sub(PROPS, POSIX_LIST)
+PROPS = DIC.fromkeys(DIC, True)
+PROPS = {k: v for k, v in PROPS.items() if k not in POSIX_LIST}
-_, _, ver_m = parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property', None, UNICODE_VERSION_REG)
+_, ver_m = parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property', None, UNICODE_VERSION_REG)
if ver_m is not None:
VERSION_INFO[0] = int(ver_m.group(1))
VERSION_INFO[1] = int(ver_m.group(2))
VERSION_INFO[2] = int(ver_m.group(3))
-dic, props, _ = parse_and_merge_properties('Scripts.txt', 'Script')
+dic, _ = parse_and_merge_properties('Scripts.txt', 'Script')
DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic))
parse_and_merge_properties('PropList.txt', 'Binary Property')
-_, _, ver_m = parse_and_merge_properties('emoji-data.txt', 'Emoji Property', None, EMOJI_VERSION_REG)
+_, ver_m = parse_and_merge_properties('emoji-data.txt', 'Emoji Property', None, EMOJI_VERSION_REG)
if ver_m is not None:
EMOJI_VERSION_INFO[0] = int(ver_m.group(1))
EMOJI_VERSION_INFO[1] = int(ver_m.group(2))
-PROPS.append('Unknown')
+PROPS['Unknown'] = True
KDIC['Unknown'] = 'Script'
ALIASES = parse_property_aliases('PropertyAliases.txt')
@@ -501,34 +503,39 @@ dic, BLOCKS = parse_blocks('Blocks.txt')
merge_dic(DIC, dic)
if INCLUDE_GRAPHEME_CLUSTER_DATA:
- dic, props, _ = parse_properties('GraphemeBreakProperty.txt',
- 'GraphemeBreak Property',
- GRAPHEME_CLUSTER_BREAK_NAME_PREFIX)
+ dic, _ = parse_properties('GraphemeBreakProperty.txt',
+ 'GraphemeBreak Property',
+ GRAPHEME_CLUSTER_BREAK_NAME_PREFIX)
merge_dic(DIC, dic)
- merge_props(PROPS, props)
+ merge_props(PROPS, dic)
#prop = GRAPHEME_CLUSTER_BREAK_NAME_PREFIX + 'Other'
#DIC[prop] = inverse_ranges(add_ranges_in_dic(dic))
- #PROPS.append(prop)
+ #PROPS[prop] = True
#KDIC[prop] = 'GrapemeBreak Property'
add_posix_props(DIC)
-PROPS = sorted(PROPS)
+PROP_LIST = sorted(PROPS.keys())
s = '''%{
/* Generated by make_unicode_property_data.py. */
'''
-print s
-print COPYRIGHT
-print ''
+print(s)
+print(COPYRIGHT)
+print('')
for prop in POSIX_LIST:
- print_property(prop, DIC[prop], "POSIX [[:%s:]]" % prop)
+ if prop == 'PosixPunct':
+ desc = "POSIX [[:punct:]]"
+ else:
+ desc = "POSIX [[:%s:]]" % prop
+
+ print_property(prop, DIC[prop], desc)
-print ''
+print('')
if not(POSIX_ONLY):
- for prop in PROPS:
+ for prop in PROP_LIST:
klass = KDIC.get(prop, None)
if klass is None:
n = len(prop)
@@ -545,18 +552,18 @@ if not(POSIX_ONLY):
print_property(block, DIC[block], 'Block')
-print ''
-print "static const OnigCodePoint*\nconst CodeRanges[] = {"
+print('')
+print("static const OnigCodePoint*\nconst CodeRanges[] = {")
for prop in POSIX_LIST:
- print " CR_%s," % prop
+ print(" CR_%s," % prop)
if not(POSIX_ONLY):
- for prop in PROPS:
- print " CR_%s," % prop
+ for prop in PROP_LIST:
+ print(" CR_%s," % prop)
for prop in BLOCKS:
- print " CR_%s," % prop
+ print(" CR_%s," % prop)
s = '''};
@@ -579,8 +586,8 @@ if OUTPUT_LIST_MODE:
if EMOJI_VERSION_INFO[0] < 0:
raise RuntimeError("Emoji Version is not found")
- print >> UPF, "Unicode Properties (Unicode Version: %d.%d.%d, Emoji: %d.%d)" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2], EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1])
- print >> UPF, ''
+ print("Unicode Properties (Unicode Version: %d.%d.%d, Emoji: %d.%d)" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2], EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]), file=UPF)
+ print('', file=UPF)
index = -1
for prop in POSIX_LIST:
@@ -588,20 +595,20 @@ for prop in POSIX_LIST:
entry_and_print_prop_and_index(prop, index)
if not(POSIX_ONLY):
- for prop in PROPS:
+ for prop in PROP_LIST:
index += 1
entry_and_print_prop_and_index(prop, index)
- NALIASES = map(lambda (k,v):(normalize_prop_name(k), k, v), ALIASES.items())
+ NALIASES = map(lambda x:(normalize_prop_name(x[0]), x[0], x[1]), ALIASES.items())
NALIASES = sorted(NALIASES)
for (nk, k, v) in NALIASES:
nv = normalize_prop_name(v)
if PropIndex.get(nk, None) is not None:
- print >> sys.stderr, "ALIASES: already exists: %s => %s" % (k, v)
+ print("ALIASES: already exists: %s => %s" % (k, v), file=sys.stderr)
continue
aindex = PropIndex.get(nv, None)
if aindex is None:
- #print >> sys.stderr, "ALIASES: value is not exist: %s => %s" % (k, v)
+ #print("ALIASES: value is not exist: %s => %s" % (k, v), file=sys.stderr)
continue
entry_prop_name(k, aindex)
@@ -611,26 +618,26 @@ if not(POSIX_ONLY):
index += 1
entry_and_print_prop_and_index(name, index)
-print '%%'
-print ''
+print('%%')
+print('')
if not(POSIX_ONLY):
if VERSION_INFO[0] < 0:
raise RuntimeError("Unicode Version is not found")
if EMOJI_VERSION_INFO[0] < 0:
raise RuntimeError("Emoji Version is not found")
- print "#define UNICODE_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2])
- print "#define UNICODE_EMOJI_VERSION %02d%02d" % (EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1])
- print ''
+ print("#define UNICODE_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]))
+ print("#define UNICODE_EMOJI_VERSION %02d%02d" % (EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]))
+ print('')
-print "#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10)
-print "#define CODE_RANGES_NUM %d" % (index + 1)
+print("#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10))
+print("#define CODE_RANGES_NUM %d" % (index + 1))
index_props = make_reverse_dic(PropIndex)
-print ''
+print('')
for i in range(index + 1):
for p in index_props[i]:
- print "#define PROP_INDEX_%s %d" % (p.upper(), i)
+ print("#define PROP_INDEX_%s %d" % (p.upper(), i))
if OUTPUT_LIST_MODE:
UPF.close()