summaryrefslogtreecommitdiff
path: root/lib/uniname/gen-uninames.lisp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/uniname/gen-uninames.lisp')
-rwxr-xr-xlib/uniname/gen-uninames.lisp139
1 files changed, 54 insertions, 85 deletions
diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp
index 9f79562..d08e93f 100755
--- a/lib/uniname/gen-uninames.lisp
+++ b/lib/uniname/gen-uninames.lisp
@@ -6,18 +6,12 @@
(defparameter add-comments nil)
(defstruct unicode-char
- (index nil :type integer)
+ (code nil :type integer)
(name nil :type string)
word-indices
word-indices-index
)
-(defstruct range
- (index nil :type integer)
- (start-code nil :type integer)
- (end-code nil :type integer)
-)
-
(defstruct word-list
(hashed nil :type hash-table)
(sorted nil :type list)
@@ -25,16 +19,10 @@
length ; number of words
)
-(defun main (inputfile outputfile aliasfile)
- (declare (type string inputfile outputfile aliasfile))
+(defun main (inputfile outputfile)
+ (declare (type string inputfile outputfile))
#+UNICODE (setq *default-file-encoding* charset:utf-8)
- (let ((all-chars '())
- (all-chars-hashed (make-hash-table :test #'equal))
- (all-aliases '())
- all-chars-and-aliases
- (all-ranges '())
- (name-index 0)
- range)
+ (let ((all-chars '()))
;; Read all characters and names from the input file.
(with-open-file (istream inputfile :direction :input)
(loop
@@ -53,53 +41,43 @@
; specially as well.
(unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
(<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
- ;; Also ignore variationselectors; they are treated
- ;; specially as well.
- (unless (or (<= #xFE00 code #xFE0F) (<= #xE0100 code #xE01EF))
- (push (make-unicode-char :index name-index
+ ; Transform the code so that it fits in 16 bits. In
+ ; Unicode 5.1 the following ranges are used.
+ ; 0x00000..0x04DFF >>12= 0x00..0x04 -> 0x0..0x4
+ ; 0x0A000..0x0AAFF >>12= 0x0A -> 0x5
+ ; 0x0F900..0x0FFFF >>12= 0x0F -> 0x6
+ ; 0x10000..0x10A58 >>12= 0x10 -> 0x7
+ ; 0x12000..0x12473 >>12= 0x12 -> 0x8
+ ; 0x1D000..0x1D7FF >>12= 0x1D -> 0x9
+ ; 0x1F000..0x1F093 >>12= 0x1F -> 0xA
+ ; 0x2F800..0x2FAFF >>12= 0x2F -> 0xB
+ ; 0xE0000..0xE00FF >>12= 0xE0 -> 0xC
+ (flet ((transform (x)
+ (dpb
+ (case (ash x -12)
+ ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
+ (#x0A 5)
+ (#x0F 6)
+ (#x10 7)
+ (#x12 8)
+ (#x1D 9)
+ (#x1F #xA)
+ (#x2F #xB)
+ (#xE0 #xC)
+ (t (error "Update the transform function for 0x~5,'0X" x))
+ )
+ (byte 8 12)
+ x
+ )) )
+ (push (make-unicode-char :code (transform code)
:name name-string)
- all-chars)
- (setf (gethash code all-chars-hashed) (car all-chars))
- ;; Update the contiguous range, or start a new range.
- (if (and range (= (1+ (range-end-code range)) code))
- (setf (range-end-code range) code)
- (progn
- (when range
- (push range all-ranges))
- (setq range (make-range :index name-index
- :start-code code
- :end-code code))))
- (incf name-index)
- (setq last-code code)
- ) ) ) )
+ all-chars
+ ) ) ) ) )
) ) ) )
(setq all-chars (nreverse all-chars))
- (if range
- (push range all-ranges))
- (setq all-ranges (nreverse all-ranges))
- (when aliasfile
- ;; Read all characters and names from the alias file.
- (with-open-file (istream aliasfile :direction :input)
- (loop
- (let ((line (read-line istream nil nil)))
- (unless line (return))
- (let* ((i1 (position #\; line))
- (i2 (position #\; line :start (1+ i1)))
- (code-string (subseq line 0 i1))
- (code (parse-integer code-string :radix 16))
- (name-string (subseq line (1+ i1) i2))
- (uc (gethash code all-chars-hashed)))
- (when uc
- (push (make-unicode-char :index (unicode-char-index uc)
- :name name-string)
- all-aliases)
- ) ) ) ) ) )
- (setq all-aliases (nreverse all-aliases)
- all-chars-and-aliases (append all-chars all-aliases))
;; Split into words.
(let ((words-by-length (make-array 0 :adjustable t)))
- (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION"
- (mapcar #'unicode-char-name all-chars-and-aliases)))
+ (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
(let ((i1 0))
(loop
(when (>= i1 (length name)) (return))
@@ -217,12 +195,12 @@
(setf (gethash word (word-list-hashed word-list)) ind-offset)
(incf ind-offset)
) ) ) )
- (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION"))
+ (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
(format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
(gethash word (word-list-hashed (aref words-by-length (length word))))
) )
;; Compute the word-indices for every unicode-char.
- (dolist (uc all-chars-and-aliases)
+ (dolist (uc all-chars)
(let ((name (unicode-char-name uc))
(indices '()))
(let ((i1 0))
@@ -242,8 +220,8 @@
)
) )
;; Sort the list of unicode-chars by word-indices.
- (setq all-chars-and-aliases
- (sort all-chars-and-aliases
+ (setq all-chars
+ (sort all-chars
(lambda (vec1 vec2)
(let ((len1 (length vec1))
(len2 (length vec2)))
@@ -262,10 +240,10 @@
) )
;; Output the word-indices.
(format ostream "static const uint16_t unicode_names[~D] = {~%"
- (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
+ (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
)
(let ((i 0))
- (dolist (uc all-chars-and-aliases)
+ (dolist (uc all-chars)
(format ostream " ~{ ~D,~}"
(maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
(coerce (unicode-char-word-indices uc) 'list)
@@ -279,14 +257,14 @@
(incf i (length (unicode-char-word-indices uc)))
) )
(format ostream "};~%")
- (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
+ (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
(format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
- (format ostream "unicode_name_to_index[~D] = {~%"
- (length all-chars-and-aliases)
+ (format ostream "unicode_name_to_code[~D] = {~%"
+ (length all-chars)
)
- (dolist (uc all-chars-and-aliases)
+ (dolist (uc all-chars)
(format ostream " { 0x~4,'0X, ~D },"
- (unicode-char-index uc)
+ (unicode-char-code uc)
(unicode-char-word-indices-index uc)
)
(when add-comments
@@ -295,14 +273,14 @@
(format ostream "~%")
)
(format ostream "};~%")
- (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
+ (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
(format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
- (format ostream "unicode_index_to_name[~D] = {~%"
+ (format ostream "unicode_code_to_name[~D] = {~%"
(length all-chars)
)
- (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index))
+ (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
(format ostream " { 0x~4,'0X, ~D },"
- (unicode-char-index uc)
+ (unicode-char-code uc)
(unicode-char-word-indices-index uc)
)
(when add-comments
@@ -312,21 +290,12 @@
)
(format ostream "};~%")
(format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
- (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases))
+ (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
)
(format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
- (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
+ (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
)
- (format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%"
- (length all-ranges))
- (dolist (range all-ranges)
- (format ostream " { ~D, ~D, ~D },~%"
- (range-index range)
- (- (range-start-code range) (range-index range))
- (1+ (- (range-end-code range) (range-start-code range))))
- )
- (format ostream "};~%")
)
) ) )
-(main (first *args*) (second *args*) (third *args*))
+(main (first *args*) (second *args*))