293 lines
12 KiB
Common Lisp
293 lines
12 KiB
Common Lisp
#!/usr/local/bin/clisp -C
|
|
|
|
;;; Creation of CLISP's uni_names.h from the UnicodeData.txt table.
|
|
;;; Bruno Haible 2000-12-28
|
|
|
|
(defparameter add-comments nil)
|
|
|
|
(defstruct unicode-char
|
|
(code nil :type integer)
|
|
(name nil :type string)
|
|
word-indices
|
|
word-indices-index
|
|
)
|
|
|
|
(defstruct word-list
|
|
(hashed nil :type hash-table)
|
|
(sorted nil :type list)
|
|
size ; number of characters total
|
|
length ; number of words
|
|
)
|
|
|
|
(defun main (inputfile outputfile)
|
|
(declare (type string inputfile outputfile))
|
|
#+UNICODE (setq *default-file-encoding* charset:utf-8)
|
|
(let ((all-chars '()))
|
|
;; Read all characters and names from the input file.
|
|
(with-open-file (istream inputfile :direction :input)
|
|
(loop
|
|
(let ((line (read-line istream nil nil)))
|
|
(unless line (return))
|
|
(let* ((i1 (position #\; line))
|
|
(i2 (position #\; line :start (1+ i1)))
|
|
(code-string (subseq line 0 i1))
|
|
(code (parse-integer code-string :radix 16))
|
|
(name-string (subseq line (1+ i1) i2)))
|
|
; Ignore characters whose name starts with "<".
|
|
(unless (eql (char name-string 0) #\<)
|
|
; Also ignore Hangul syllables; they are treated specially.
|
|
(unless (<= #xAC00 code #xD7A3)
|
|
; Also ignore CJK compatibility ideographs; they are treated
|
|
; specially as well.
|
|
(unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
|
|
(<= #x2F800 code #x2FA1D))
|
|
; Transform the code so that it fits in 16 bits. In
|
|
; Unicode 3.1 the following ranges are used.
|
|
; 0x00000..0x033FF >>12= 0x00..0x03 -> 0x0..0x3
|
|
; 0x0A000..0x0A4FF >>12= 0x0A -> 0x4
|
|
; 0x0F900..0x0FFFF >>12= 0x0F -> 0x5
|
|
; 0x10300..0x104FF >>12= 0x10 -> 0x6
|
|
; 0x1D000..0x1D7DD >>12= 0x1D -> 0x7
|
|
; 0x2F800..0x2FAFF >>12= 0x2F -> 0x8
|
|
; 0xE0000..0xE00FF >>12= 0xE0 -> 0x9
|
|
(flet ((transform (x)
|
|
(dpb
|
|
(case (ash x -12)
|
|
((#x00 #x01 #x02 #x03) (ash x -12))
|
|
(#x0A 4)
|
|
(#x0F 5)
|
|
(#x10 6)
|
|
(#x1D 7)
|
|
(#x2F 8)
|
|
(#xE0 9)
|
|
(t (error "Update the transform function for 0x~5,'0X" x))
|
|
)
|
|
(byte 8 12)
|
|
x
|
|
)) )
|
|
(push (make-unicode-char :code (transform code)
|
|
:name name-string)
|
|
all-chars
|
|
) ) ) ) )
|
|
) ) ) )
|
|
(setq all-chars (nreverse all-chars))
|
|
;; Split into words.
|
|
(let ((words-by-length (make-array 0 :adjustable t)))
|
|
(dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
|
|
(let ((i1 0))
|
|
(loop
|
|
(when (>= i1 (length name)) (return))
|
|
(let ((i2 (or (position #\Space name :start i1) (length name))))
|
|
(let* ((word (subseq name i1 i2))
|
|
(len (length word)))
|
|
(when (>= len (length words-by-length))
|
|
(adjust-array words-by-length (1+ len))
|
|
)
|
|
(unless (aref words-by-length len)
|
|
(setf (aref words-by-length len)
|
|
(make-word-list
|
|
:hashed (make-hash-table :test #'equal)
|
|
:sorted '()
|
|
) ) )
|
|
(let ((word-list (aref words-by-length len)))
|
|
(unless (gethash word (word-list-hashed word-list))
|
|
(setf (gethash word (word-list-hashed word-list)) t)
|
|
(push word (word-list-sorted word-list))
|
|
) )
|
|
)
|
|
(setq i1 (1+ i2))
|
|
) ) ) )
|
|
;; Sort the word lists.
|
|
(dotimes (len (length words-by-length))
|
|
(unless (aref words-by-length len)
|
|
(setf (aref words-by-length len)
|
|
(make-word-list
|
|
:hashed (make-hash-table :test #'equal)
|
|
:sorted '()
|
|
) ) )
|
|
(let ((word-list (aref words-by-length len)))
|
|
(setf (word-list-sorted word-list)
|
|
(sort (word-list-sorted word-list) #'string<)
|
|
)
|
|
(setf (word-list-size word-list)
|
|
(reduce #'+ (mapcar #'length (word-list-sorted word-list)))
|
|
)
|
|
(setf (word-list-length word-list)
|
|
(length (word-list-sorted word-list))
|
|
) ) )
|
|
;; Output the tables.
|
|
(with-open-file (ostream outputfile :direction :output
|
|
#+UNICODE :external-format #+UNICODE charset:ascii)
|
|
(format ostream "/*~%")
|
|
(format ostream " * ~A~%" (file-namestring outputfile))
|
|
(format ostream " *~%")
|
|
(format ostream " * Unicode character name table.~%")
|
|
(format ostream " * Generated automatically by the gen-uninames utility.~%")
|
|
(format ostream " */~%")
|
|
(format ostream "~%")
|
|
(format ostream "static const char unicode_name_words[~D] = {~%"
|
|
(let ((sum 0))
|
|
(dotimes (len (length words-by-length))
|
|
(let ((word-list (aref words-by-length len)))
|
|
(incf sum (word-list-size word-list))
|
|
) )
|
|
sum
|
|
) )
|
|
(dotimes (len (length words-by-length))
|
|
(let ((word-list (aref words-by-length len)))
|
|
(dolist (word (word-list-sorted word-list))
|
|
(format ostream " ~{ '~C',~}~%" (coerce word 'list))
|
|
) ) )
|
|
(format ostream "};~%")
|
|
(format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
|
|
(let ((sum 0))
|
|
(dotimes (len (length words-by-length))
|
|
(let ((word-list (aref words-by-length len)))
|
|
(incf sum (word-list-length word-list))
|
|
) )
|
|
sum
|
|
) )
|
|
#| ; Redundant data
|
|
(format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
|
|
(let ((sum 0))
|
|
(dotimes (len (length words-by-length))
|
|
(let ((word-list (aref words-by-length len)))
|
|
(incf sum (word-list-length word-list))
|
|
) )
|
|
sum
|
|
) )
|
|
(dotimes (len (length words-by-length))
|
|
(let ((word-list (aref words-by-length len)))
|
|
(when (word-list-sorted word-list)
|
|
(format ostream " ")
|
|
(do ((l (word-list-sorted word-list) (cdr l))
|
|
(offset 0 (+ offset (length (car l)))))
|
|
((endp l))
|
|
(format ostream "~<~% ~0,79:; ~D,~>" offset)
|
|
)
|
|
(format ostream "~%")
|
|
) ) )
|
|
(format ostream "};~%")
|
|
|#
|
|
(format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
|
|
(1+ (length words-by-length))
|
|
)
|
|
(let ((extra-offset 0)
|
|
(ind-offset 0))
|
|
(dotimes (len (length words-by-length))
|
|
(let ((word-list (aref words-by-length len)))
|
|
(format ostream " { ~D, ~D },~%" extra-offset ind-offset)
|
|
(incf extra-offset (word-list-size word-list))
|
|
(incf ind-offset (word-list-length word-list))
|
|
) )
|
|
(format ostream " { ~D, ~D }~%" extra-offset ind-offset)
|
|
)
|
|
(format ostream "};~%")
|
|
(let ((ind-offset 0))
|
|
(dotimes (len (length words-by-length))
|
|
(let ((word-list (aref words-by-length len)))
|
|
(dolist (word (word-list-sorted word-list))
|
|
(setf (gethash word (word-list-hashed word-list)) ind-offset)
|
|
(incf ind-offset)
|
|
) ) ) )
|
|
(dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
|
|
(format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
|
|
(gethash word (word-list-hashed (aref words-by-length (length word))))
|
|
) )
|
|
;; Compute the word-indices for every unicode-char.
|
|
(dolist (uc all-chars)
|
|
(let ((name (unicode-char-name uc))
|
|
(indices '()))
|
|
(let ((i1 0))
|
|
(loop
|
|
(when (>= i1 (length name)) (return))
|
|
(let ((i2 (or (position #\Space name :start i1) (length name))))
|
|
(let* ((word (subseq name i1 i2))
|
|
(len (length word)))
|
|
(push (gethash word (word-list-hashed (aref words-by-length len)))
|
|
indices
|
|
)
|
|
)
|
|
(setq i1 (1+ i2))
|
|
) ) )
|
|
(setf (unicode-char-word-indices uc)
|
|
(coerce (nreverse indices) 'vector)
|
|
)
|
|
) )
|
|
;; Sort the list of unicode-chars by word-indices.
|
|
(setq all-chars
|
|
(sort all-chars
|
|
(lambda (vec1 vec2)
|
|
(let ((len1 (length vec1))
|
|
(len2 (length vec2)))
|
|
(do ((i 0 (1+ i)))
|
|
(nil)
|
|
(if (< i len2)
|
|
(if (< i len1)
|
|
(cond ((< (aref vec1 i) (aref vec2 i)) (return t))
|
|
((> (aref vec1 i) (aref vec2 i)) (return nil))
|
|
)
|
|
(return t)
|
|
)
|
|
(return nil)
|
|
) ) ) )
|
|
:key #'unicode-char-word-indices
|
|
) )
|
|
;; Output the word-indices.
|
|
(format ostream "static const uint16_t unicode_names[~D] = {~%"
|
|
(reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
|
|
)
|
|
(let ((i 0))
|
|
(dolist (uc all-chars)
|
|
(format ostream " ~{ ~D,~}"
|
|
(maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
|
|
(coerce (unicode-char-word-indices uc) 'list)
|
|
)
|
|
)
|
|
(when add-comments
|
|
(format ostream "~40T/* ~A */" (unicode-char-name uc))
|
|
)
|
|
(format ostream "~%")
|
|
(setf (unicode-char-word-indices-index uc) i)
|
|
(incf i (length (unicode-char-word-indices uc)))
|
|
) )
|
|
(format ostream "};~%")
|
|
(format ostream "static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[~D] = {~%"
|
|
(length all-chars)
|
|
)
|
|
(dolist (uc all-chars)
|
|
(format ostream " { 0x~4,'0X, ~D },"
|
|
(unicode-char-code uc)
|
|
(unicode-char-word-indices-index uc)
|
|
)
|
|
(when add-comments
|
|
(format ostream "~21T/* ~A */" (unicode-char-name uc))
|
|
)
|
|
(format ostream "~%")
|
|
)
|
|
(format ostream "};~%")
|
|
(format ostream "static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[~D] = {~%"
|
|
(length all-chars)
|
|
)
|
|
(dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
|
|
(format ostream " { 0x~4,'0X, ~D },"
|
|
(unicode-char-code uc)
|
|
(unicode-char-word-indices-index uc)
|
|
)
|
|
(when add-comments
|
|
(format ostream "~21T/* ~A */" (unicode-char-name uc))
|
|
)
|
|
(format ostream "~%")
|
|
)
|
|
(format ostream "};~%")
|
|
(format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
|
|
(reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
|
|
)
|
|
(format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
|
|
(reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
|
|
)
|
|
)
|
|
) ) )
|
|
|
|
(main (first *args*) (second *args*))
|