emacs-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: idn.el and confusables.txt


From: Kenichi Handa
Subject: Re: idn.el and confusables.txt
Date: Fri, 17 Jun 2011 17:15:23 +0900

In article <address@hidden>, handa <address@hidden> writes:

> But, at first, I'll work on improving C interface to
> uni-*.el.

Attached is the first version for that.  It provides two C
functions (excerpt from chartab.c).

/* Unicode character property

   This section provides a convenient and efficient way to get a
   Unicode character property from C code (from Lisp, you can use
   get-char-code-property).

   The typical usage is to get a char-table for a specific property at
   a proper initialization time as this:

        Lisp_Object bidi_class_table = uniprop_table (intern ("bidi-class"));

   and get a property value for character CH as this:

        Lisp_Object bidi_class = uniprop_table_lookup (CH, bidi_class_table);

   In this case, what you actually get is an index number to the
   vector of property values (symbols nil, L, R, etc).  See the
   comment of uniprop_table_lookup for the detail.  */

Eli, I arranged that the bidi_class obtained as above is an integer
value that is the same as enum bidi_type_t defined in
dispextern.h.  And uniprop_table (intern ("mirroring")) gives
a char-table for mirroring characters.

Could you check if the attached patch gives sufficient
facility to be used in the bidi code?

---
Kenichi Handa
address@hidden

=== modified file 'admin/ChangeLog'
--- admin/ChangeLog     2011-04-10 16:33:22 +0000
+++ admin/ChangeLog     2011-06-17 07:27:55 +0000
@@ -1,3 +1,29 @@
+2011-06-17  Kenichi Handa  <address@hidden>
+
+       * unidata/unidata-gen.el (unidata-dir): New variable.
+       (unidata-setup-list): Expand unidata-text-file in unidata-dir.
+       (unidata-prop-alist): INDEX element may be a function.  New
+       optional element VAL-LIST (for general-category and bidi-class).
+       New entry `mirroring'.
+       (unidata-get-character): Adjusted for the new compression format
+       of char-table element.
+       (unidata-gen-table-character): New arg IGNORE.  Adjusted for the
+       above changes.
+       (unidata-encode-val): Assume the first element of VAL-LIST is (nil
+       . 0).
+       (unidata-gen-table): Change argument DEFAULT-VALUE to VAL-LIST.
+       (unidata-gen-table-symbol): New arg VAL-LIST.
+       (unidata-gen-table-integer): Likewise.
+       (unidata-gen-table-numeric): Likewise.
+       (unidata-gen-table-name): New arg IGNORE.
+       (unidata-gen-table-decomposition): Likewise.
+       (unidata-gen-mirroring-list): New function.
+       (unidata-gen-files): New arg DATA-DIR.  Adjusted for the change of
+       unidata-prop-alist.
+
+       * unidata/Makefile.in (${DSTDIR}/charprop.el): New arg to
+       unidata-gen-files.
+
 2011-03-07  Chong Yidong  <address@hidden>
 
        * Version 23.3 released.

=== modified file 'admin/unidata/Makefile.in'
--- admin/unidata/Makefile.in   2011-01-14 17:18:41 +0000
+++ admin/unidata/Makefile.in   2011-06-16 03:46:42 +0000
@@ -33,9 +33,10 @@
 
 ${DSTDIR}/charprop.el: unidata-gen.elc unidata.txt
        ELC=`/bin/pwd`/unidata-gen.elc; \
-       DATA=`/bin/pwd`/unidata.txt; \
+       DATADIR=`/bin/pwd`; \
+       DATA=unidata.txt; \
        cd ${DSTDIR}; \
-       ${RUNEMACS} -batch --load $${ELC} -f unidata-gen-files $${DATA}
+       ${RUNEMACS} -batch --load $${ELC} -f unidata-gen-files $${DATADIR} $${DATA}
 
 ../../src/biditype.h: UnicodeData.txt
        gawk -F";" -f biditype.awk $< > $@

=== modified file 'admin/unidata/unidata-gen.el'
--- admin/unidata/unidata-gen.el        2011-01-14 17:18:41 +0000
+++ admin/unidata/unidata-gen.el        2011-06-17 07:07:19 +0000
@@ -41,7 +41,7 @@
 ;;   uni-name.el, uni-category.el, uni-combining.el, uni-bidi.el,
 ;;   uni-decomposition.el, uni-decimal.el, uni-digit.el, uni-numeric.el,
 ;;   uni-mirrored.el, uni-old-name.el, uni-comment.el, uni-uppercase.el,
-;;   uni-lowercase.el, uni-titlecase.el
+;;   uni-lowercase.el, uni-titlecase.el, uni-mirroring.el
 ;;     They each contain a single form of this format:
 ;;       (char-code-property-register PROP CHAR-TABLE)
 ;;     where PROP is the same as above, and CHAR-TABLE is a
@@ -49,8 +49,9 @@
 ;;
 ;;   When they are installed in .../lisp/international/, the file
 ;;   "charprop.el" is preloaded in loadup.el.  The other files are
-;;   automatically loaded when the functions `get-char-code-property'
-;;   and `put-char-code-property' are called.
+;;   automatically loaded when the Lisp functions
+;;   `get-char-code-property' and `put-char-code-property', or the C
+;;   function uniprop_table, are called.
 ;;
 ;; FORMAT OF A CHAR TABLE
 ;;
@@ -70,7 +71,8 @@
 
 ;;   The char table has four extra slots:
 ;;      1st: property symbol
-;;     2nd: function to call to get a property value
+;;     2nd: function to call to get a property value,
+;;          or an index number of C function to uncompress the data
 ;;     3nd: function to call to put a property value
 ;;     4th: function to call to get a description of a property value
 ;;     5th: data referred by the above functions
@@ -82,6 +84,11 @@
 
 (defvar unidata-list nil)
 
+;; Name of the directory containing files of Unicode Character
+;; Database.
+
+(defvar unidata-dir nil)
+
 (defun unidata-setup-list (unidata-text-file)
   (let* ((table (list nil))
         (tail table)
@@ -90,6 +97,7 @@
                        ("^<.*Surrogate" . nil)
                        ("^<.*Private Use" . PRIVATE\ USE)))
         val char name)
+    (setq unidata-text-file (expand-file-name unidata-text-file unidata-dir))
     (or (file-readable-p unidata-text-file)
        (error "File not readable: %s" unidata-text-file))
     (with-temp-buffer
@@ -136,10 +144,13 @@
 ;; Alist of this form:
 ;;   (PROP INDEX GENERATOR FILENAME)
 ;; PROP: character property
-;; INDEX: index to each element of unidata-list for PROP
+;; INDEX: index to each element of unidata-list for PROP.
+;;   It may be a function that generates an alist of character codes
+;;   vs. the corresponding property values.
 ;; GENERATOR: function to generate a char-table
 ;; FILENAME: filename to store the char-table
 ;; DESCRIBER: function to call to get a description string of property value
+;; VAL-LIST: list of specially ordered property values
 
 (defconst unidata-prop-alist
   '((name
@@ -152,7 +163,9 @@
 Property value is one of the following symbols:
   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn"
-     unidata-describe-general-category)
+     unidata-describe-general-category
+     (Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po
+        Sm Sc Sk So Zs Zl Zp Cc Cf Cs Co Cn))
     (canonical-combining-class
      3 unidata-gen-table-integer "uni-combining.el"
      "Unicode canonical combining class.
@@ -164,7 +177,8 @@
 Property value is one of the following symbols:
   L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET,
   AN, CS, NSM, BN, B, S, WS, ON"
-     unidata-describe-bidi-class)
+     unidata-describe-bidi-class
+     (L R EN AN BN B AL LRE LRO RLE RLO PDF ES ET CS NSM S WS ON))
     (decomposition
      5 unidata-gen-table-decomposition "uni-decomposition.el"
      "Unicode decomposition mapping.
@@ -188,7 +202,7 @@
     (mirrored
      9 unidata-gen-table-symbol "uni-mirrored.el"
      "Unicode bidi mirrored flag.
-Property value is a symbol `Y' or `N'.")
+Property value is a symbol `Y' or `N'.  See also the property `mirroring'.")
     (old-name
      10 unidata-gen-table-name "uni-old-name.el"
      "Unicode old names as published in Unicode 1.0.
@@ -211,7 +225,12 @@
      14 unidata-gen-table-character "uni-titlecase.el"
      "Unicode simple titlecase mapping.
 Property value is a character."
-     string)))
+     string)
+    (mirroring
+     unidata-gen-mirroring-list unidata-gen-table-character "uni-mirroring.el"
+     "Unicode bidi-mirroring characters.
+Property value is a character that has the corresponding mirroring image,
+or nil for non-mirrored character.")))
 
 ;; Functions to access the above data.
 (defsubst unidata-prop-index (prop) (nth 1 (assq prop unidata-prop-alist)))
@@ -219,6 +238,7 @@
 (defsubst unidata-prop-file (prop) (nth 3 (assq prop unidata-prop-alist)))
 (defsubst unidata-prop-docstring (prop) (nth 4 (assq prop unidata-prop-alist)))
 (defsubst unidata-prop-describer (prop) (nth 5 (assq prop unidata-prop-alist)))
+(defsubst unidata-prop-val-list (prop) (nth 6 (assq prop unidata-prop-alist)))
 
 
 ;; SIMPLE TABLE
@@ -227,14 +247,18 @@
 ;; values of succeeding character codes are usually different, we use
 ;; a char-table described here to store such values.
 ;;
-;; If succeeding 128 characters has no property, a char-table has the
-;; symbol t for them.  Otherwise a char-table has a string of the
-;; following format for them.
+;; A char-table divides character code space (#x0..#x3FFFFF) into
+;; #x8000 blocks (each block contains 128 characters).
+
+;; If all characters of a block have no property, a char-table has the
+;; symbol nil for that block.  Otherwise a char-table has a string of
+;; the following format for it.
 ;;
-;; The first character of the string is FIRST-INDEX.
-;; The Nth (N > 0) character of the string is a property value of the
-;; character (BLOCK-HEAD + FIRST-INDEX + N - 1), where BLOCK-HEAD is
-;; the first of the characters in the block.
+;; The first character of the string is '\0'.
+;; The second character of the string is FIRST-INDEX.
+;; The Nth (N > 1) character of the string is a property value of the
+;; character (BLOCK-HEAD + FIRST-INDEX + N - 2), where BLOCK-HEAD is
+;; the first character of the block.
 ;;
 ;; The 4th extra slot of a char-table is nil.
 
@@ -247,9 +271,9 @@
     (let* ((len (length val))
           (block-head (lsh (lsh char -7) 7))
           (vec (make-vector 128 nil))
-          (first-index (aref val 0)))
+          (first-index (aref val 1)))
       (dotimes (i (1- len))
-       (let ((elt (aref val (1+ i))))
+       (let ((elt (aref val (+ 2 i))))
          (if (> elt 0)
              (aset vec (+ first-index i) elt))))
       (dotimes (i 128)
@@ -266,13 +290,15 @@
          (funcall (char-table-extra-slot table 1) char current-val table))
       (aset table char val))))
 
-(defun unidata-gen-table-character (prop)
+(defun unidata-gen-table-character (prop ignore)
   (let ((table (make-char-table 'char-code-property-table))
        (prop-idx (unidata-prop-index prop))
        (vec (make-vector 128 0))
        (tail unidata-list)
        elt range val idx slot)
-    (set-char-table-range table (cons 0 (max-char)) t)
+    (if (functionp prop-idx)
+       (setq tail (funcall prop-idx)
+             prop-idx 1))
     (while tail
       (setq elt (car tail) tail (cdr tail))
       (setq range (car elt)
@@ -301,7 +327,7 @@
                  (setq first-index last-index)))
            (setq tail (cdr tail)))
          (when first-index
-           (let ((str (string first-index))
+           (let ((str (string 0 first-index))
                  c)
              (while (<= first-index last-index)
                (setq str (format "%s%c"  str (or (aref vec first-index) 0))
@@ -311,7 +337,7 @@
     (set-char-table-extra-slot table 0 prop)
     (byte-compile 'unidata-get-character)
     (byte-compile 'unidata-put-character)
-    (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-character))
+    (set-char-table-extra-slot table 1 0)
     (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-character))
 
     table))
@@ -463,30 +489,34 @@
 ;; Encode the character property value VAL into an integer value by
 ;; VAL-LIST.  By side effect, VAL-LIST is modified.
 ;; VAL-LIST has this form:
-;;   (t (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...)
-;; If VAL is one of VALn, just return VAL-CODEn.  Otherwise,
-;; VAL-LIST is modified to this:
-;;   (t (VAL . (1+ VAL-CODE1)) (VAL1 . VAL-CODE1) (VAL2 . VAL-CODE2) ...)
+;;   ((nil . 0) (VAL1 . 1) (VAL2 . 2) ...)
+;; If VAL is one of VALn, just return n.
+;; Otherwise, VAL-LIST is modified to this:
+;;   ((nil . 0) (VAL1 . 1) (VAL2 . 2) ... (VAL . n+1))
 
 (defun unidata-encode-val (val-list val)
   (let ((slot (assoc val val-list))
        val-code)
     (if slot
        (cdr slot)
-      (setq val-code (if (cdr val-list) (1+ (cdr (nth 1 val-list))) 1))
-      (setcdr val-list (cons (cons val val-code) (cdr val-list)))
+      (setq val-code (length val-list))
+      (nconc val-list (list (cons val val-code)))
       val-code)))
 
 ;; Generate a char-table for the character property PROP.
 
-(defun unidata-gen-table (prop val-func default-value)
+(defun unidata-gen-table (prop val-func val-list)
   (let ((table (make-char-table 'char-code-property-table))
        (prop-idx (unidata-prop-index prop))
-       (val-list (list t))
        (vec (make-vector 128 0))
        tail elt range val val-code idx slot
        prev-range-data)
-    (set-char-table-range table (cons 0 (max-char)) default-value)
+    (setq val-list (cons nil (copy-sequence val-list)))
+    (setq tail val-list val-code 0)
+    ;; Convert (nil A B ...) to ((nil . 0) (A . 1) (B . 2) ...)
+    (while tail
+      (setcar tail (cons (car tail) val-code))
+      (setq tail (cdr tail) val-code (1+ val-code)))
     (setq tail unidata-list)
     (while tail
       (setq elt (car tail) tail (cdr tail))
@@ -534,7 +564,7 @@
            (if val-code
                (aset vec (- range start) val-code))
            (setq tail (cdr tail)))
-         (setq str "" val-code -1 count 0)
+         (setq str "\000" val-code -1 count 0)
          (mapc #'(lambda (x)
                    (if (= val-code x)
                        (setq count (1+ count))
@@ -559,34 +589,33 @@
                  (setq str (concat str (string val-code)))))
              (set-char-table-range table (cons start limit) str))))))
 
-    (setq val-list (nreverse (cdr val-list)))
     (set-char-table-extra-slot table 0 prop)
     (set-char-table-extra-slot table 4 (vconcat (mapcar 'car val-list)))
     table))
 
-(defun unidata-gen-table-symbol (prop)
+(defun unidata-gen-table-symbol (prop val-list)
   (let ((table (unidata-gen-table prop
                                  #'(lambda (x) (and (> (length x) 0)
                                                     (intern x)))
-                                 0)))
+                                 val-list)))
     (byte-compile 'unidata-get-symbol)
     (byte-compile 'unidata-put-symbol)
-    (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-symbol))
+    (set-char-table-extra-slot table 1 1)
     (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-symbol))
     table))
 
-(defun unidata-gen-table-integer (prop)
+(defun unidata-gen-table-integer (prop val-list)
   (let ((table (unidata-gen-table prop
                                  #'(lambda (x) (and (> (length x) 0)
                                                     (string-to-number x)))
-                                 t)))
+                                 val-list)))
     (byte-compile 'unidata-get-integer)
     (byte-compile 'unidata-put-integer)
-    (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-integer))
+    (set-char-table-extra-slot table 1 1)
     (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-integer))
     table))
 
-(defun unidata-gen-table-numeric (prop)
+(defun unidata-gen-table-numeric (prop val-list)
   (let ((table (unidata-gen-table prop
                                  #'(lambda (x)
                                      (if (string-match "/" x)
@@ -595,10 +624,10 @@
                                              (substring x (match-end 0))))
                                        (if (> (length x) 0)
                                            (string-to-number x))))
-                                 t)))
+                                 val-list)))
     (byte-compile 'unidata-get-numeric)
     (byte-compile 'unidata-put-numeric)
-    (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-numeric))
+    (set-char-table-extra-slot table 1 1)
     (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-numeric))
     table))
 
@@ -1025,7 +1054,7 @@
                      idx (1+ i)))))
        (nreverse (cons (intern (substring str idx)) l))))))
 
-(defun unidata-gen-table-name (prop)
+(defun unidata-gen-table-name (prop ignore)
   (let* ((table (unidata-gen-table-word-list prop 'unidata-split-name))
         (word-tables (char-table-extra-slot table 4)))
     (byte-compile 'unidata-get-name)
@@ -1064,7 +1093,7 @@
        (nreverse l)))))
 
 
-(defun unidata-gen-table-decomposition (prop)
+(defun unidata-gen-table-decomposition (prop ignore)
   (let* ((table (unidata-gen-table-word-list prop 'unidata-split-decomposition))
         (word-tables (char-table-extra-slot table 4)))
     (byte-compile 'unidata-get-decomposition)
@@ -1171,6 +1200,19 @@
                 (string ?'))))
    val " "))
 
+(defun unidata-gen-mirroring-list ()
+  (let ((head (list nil))
+       tail)
+    (with-temp-buffer
+      (insert-file-contents (expand-file-name "BidiMirroring.txt" unidata-dir))
+      (goto-char (point-min))
+      (setq tail head)
+      (while (re-search-forward "^\\([0-9A-F]+\\);\\s +\\([0-9A-F]+\\)" nil t)
+       (let ((char (string-to-number (match-string 1) 16))
+             (mirror (match-string 2)))
+         (setq tail (setcdr tail (list (list char mirror)))))))
+    (cdr head)))
+
 ;; Verify if we can retrieve correct values from the generated
 ;; char-tables.
 
@@ -1212,13 +1254,16 @@
 ;; The entry function.  It generates files described in the header
 ;; comment of this file.
 
-(defun unidata-gen-files (&optional unidata-text-file)
-  (or unidata-text-file
-      (setq unidata-text-file (car command-line-args-left)
+(defun unidata-gen-files (&optional data-dir unidata-text-file)
+  (or data-dir
+      (setq data-dir (car command-line-args-left)
+           command-line-args-left (cdr command-line-args-left)
+           unidata-text-file (car command-line-args-left)
            command-line-args-left (cdr command-line-args-left)))
-  (unidata-setup-list unidata-text-file)
   (let ((coding-system-for-write 'utf-8-unix)
-       (charprop-file "charprop.el"))
+       (charprop-file "charprop.el")
+       (unidata-dir data-dir))
+    (unidata-setup-list unidata-text-file)
     (with-temp-file charprop-file
       (insert ";; Automatically generated by unidata-gen.el.\n")
       (dolist (elt unidata-prop-alist)
@@ -1227,6 +1272,7 @@
               (file (unidata-prop-file prop))
               (docstring (unidata-prop-docstring prop))
               (describer (unidata-prop-describer prop))
+              (val-list (unidata-prop-val-list prop))
               table)
          ;; Filename in this comment line is extracted by sed in
          ;; Makefile.
@@ -1235,15 +1281,15 @@
                          prop file docstring))
          (with-temp-file file
            (message "Generating %s..." file)
-           (setq table (funcall generator prop))
+           (setq table (funcall generator prop val-list))
            (when describer
              (unless (subrp (symbol-function describer))
                (byte-compile describer)
                (setq describer (symbol-function describer)))
              (set-char-table-extra-slot table 3 describer))
            (insert ";; Copyright (C) 1991-2009 Unicode, Inc.
-;; This file was generated from the Unicode data file at
-;; http://www.unicode.org/Public/UNIDATA/UnicodeData.txt.
+;; This file was generated from the Unicode data files at
+;; http://www.unicode.org/Public/UNIDATA/.
 ;; See lisp/international/README for the copyright and permission notice.\n"
                    (format "(define-char-code-property '%S %S %S)\n"
                            prop table docstring)
@@ -1251,7 +1297,8 @@
                    ";; coding: utf-8\n"
                    ";; no-byte-compile: t\n"
                    ";; End:\n\n"
-                   (format ";; %s ends here\n" file)))))
+                   (format ";; %s ends here\n" file))
+           (message "Generating %s...done" file))))
       (message "Writing %s..." charprop-file)
       (insert ";; Local Variables:\n"
              ";; coding: utf-8\n"

=== modified file 'lisp/ChangeLog'
--- lisp/ChangeLog      2011-05-18 03:42:33 +0000
+++ lisp/ChangeLog      2011-06-17 07:33:43 +0000
@@ -1,3 +1,28 @@
+2011-06-17  Kenichi Handa  <address@hidden>
+
+       * international/mule-cmds.el (char-code-property-alist): Moved to
+       src/chartab.c.
+       (get-char-code-property): Call get-unicode-property-internal where
+       necessary.
+
+       * international/charprop.el:
+       * international/uni-bidi.el:
+       * international/uni-category.el:
+       * international/uni-combining.el:
+       * international/uni-comment.el:
+       * international/uni-decimal.el:
+       * international/uni-decomposition.el:
+       * international/uni-digit.el:
+       * international/uni-lowercase.el:
+       * international/uni-mirrored.el:
+       * international/uni-name.el:
+       * international/uni-numeric.el:
+       * international/uni-old-name.el:
+       * international/uni-titlecase.el:
+       * international/uni-uppercase.el: Regenerate.
+
+       * international/uni-mirroring.el: New file.
+
 2011-05-18  Glenn Morris  <address@hidden>
 
        * emacs-lisp/authors.el (authors-fixed-entries): Remove fakemail.c

=== modified file 'lisp/international/charprop.el'
--- lisp/international/charprop.el      2010-06-09 15:46:41 +0000
+++ lisp/international/charprop.el      2011-06-17 07:07:51 +0000
@@ -1,8 +1,4 @@
-;; Copyright (C) 1991-2010 Unicode, Inc.
-;; This file was generated from the Unicode data file at
-;; http://www.unicode.org/Public/UNIDATA/UnicodeData.txt.
-;; See lisp/international/README for the copyright and permission notice.
-
+;; Automatically generated by unidata-gen.el.
 ;; FILE: uni-name.el
 (define-char-code-property 'name "uni-name.el"
   "Unicode character name.
@@ -45,7 +41,7 @@
 ;; FILE: uni-mirrored.el
 (define-char-code-property 'mirrored "uni-mirrored.el"
   "Unicode bidi mirrored flag.
-Property value is a symbol `Y' or `N'.")
+Property value is a symbol `Y' or `N'.  See also the property `mirroring'.")
 ;; FILE: uni-old-name.el
 (define-char-code-property 'old-name "uni-old-name.el"
   "Unicode old names as published in Unicode 1.0.
@@ -66,6 +62,11 @@
 (define-char-code-property 'titlecase "uni-titlecase.el"
   "Unicode simple titlecase mapping.
 Property value is a character.")
+;; FILE: uni-mirroring.el
+(define-char-code-property 'mirroring "uni-mirroring.el"
+  "Unicode bidi-mirroring characters.
+Property value is a character that has the corresponding mirroring image,
+or nil for non-mirrored character.")
 ;; Local Variables:
 ;; coding: utf-8
 ;; no-byte-compile: t

=== modified file 'lisp/international/mule-cmds.el'
--- lisp/international/mule-cmds.el     2011-02-28 01:07:29 +0000
+++ lisp/international/mule-cmds.el     2011-06-04 04:33:19 +0000
@@ -2709,16 +2709,6 @@
 
 ;;; Character property
 
-;; Each element has the form (PROP . TABLE).
-;; PROP is a symbol representing a character property.
-;; TABLE is a char-table containing the property value for each character.
-;; TABLE may be a name of file to load to build a char-table.
-;; Don't modify this variable directly but use `define-char-code-property'.
-
-(defvar char-code-property-alist nil
-  "Alist of character property name vs char-table containing property values.
-Internal use only.")
-
 (put 'char-code-property-table 'char-table-extra-slots 5)
 
 (defun define-char-code-property (name table &optional docstring)
@@ -2776,10 +2766,11 @@
          (if (stringp (cdr slot))
              (load (cdr slot) nil t))
          (setq table (cdr slot)
-               value (aref table char)
                func (char-table-extra-slot table 1))
-         (if (functionp func)
-             (setq value (funcall func char value table)))
+         (cond ((integerp func)
+                (setq value (get-unicode-property-internal char table)))
+               ((functionp func)
+                (setq value (funcall func char (aref table char) table))))
          value)
       (plist-get (aref char-code-property-table char) propname))))
 

=== modified file 'lisp/international/uni-bidi.el'
Binary files lisp/international/uni-bidi.el     2010-09-05 02:06:39 +0000 and 
lisp/international/uni-bidi.el    2011-06-17 07:07:49 +0000 differ
=== modified file 'lisp/international/uni-category.el'
Binary files lisp/international/uni-category.el 2010-09-05 02:06:39 +0000 and 
lisp/international/uni-category.el        2011-06-17 07:07:49 +0000 differ
=== modified file 'lisp/international/uni-combining.el'
Binary files lisp/international/uni-combining.el        2010-09-05 02:06:39 
+0000 and lisp/international/uni-combining.el       2011-06-17 07:07:49 +0000 
differ
=== modified file 'lisp/international/uni-comment.el'
Binary files lisp/international/uni-comment.el  2010-06-09 15:46:41 +0000 and 
lisp/international/uni-comment.el 2011-06-17 07:07:50 +0000 differ
=== modified file 'lisp/international/uni-decimal.el'
Binary files lisp/international/uni-decimal.el  2010-09-05 02:06:39 +0000 and 
lisp/international/uni-decimal.el 2011-06-17 07:07:50 +0000 differ
=== modified file 'lisp/international/uni-decomposition.el'
Binary files lisp/international/uni-decomposition.el    2010-06-09 15:46:41 
+0000 and lisp/international/uni-decomposition.el   2011-06-17 07:07:50 +0000 
differ
=== modified file 'lisp/international/uni-digit.el'
Binary files lisp/international/uni-digit.el    2010-06-09 15:46:41 +0000 and 
lisp/international/uni-digit.el   2011-06-17 07:07:50 +0000 differ
=== modified file 'lisp/international/uni-lowercase.el'
Binary files lisp/international/uni-lowercase.el        2010-06-09 15:46:41 
+0000 and lisp/international/uni-lowercase.el       2011-06-17 07:07:51 +0000 
differ
=== modified file 'lisp/international/uni-mirrored.el'
Binary files lisp/international/uni-mirrored.el 2010-09-05 02:06:39 +0000 and 
lisp/international/uni-mirrored.el        2011-06-17 07:07:50 +0000 differ
=== added file 'lisp/international/uni-mirroring.el'
Binary files lisp/international/uni-mirroring.el        1970-01-01 00:00:00 
+0000 and lisp/international/uni-mirroring.el       2011-06-17 07:07:51 +0000 
differ
=== modified file 'lisp/international/uni-name.el'
Binary files lisp/international/uni-name.el     2010-09-05 02:06:39 +0000 and 
lisp/international/uni-name.el    2011-06-17 07:07:49 +0000 differ
=== modified file 'lisp/international/uni-numeric.el'
Binary files lisp/international/uni-numeric.el  2010-06-09 15:46:41 +0000 and 
lisp/international/uni-numeric.el 2011-06-17 07:07:50 +0000 differ
=== modified file 'lisp/international/uni-old-name.el'
Binary files lisp/international/uni-old-name.el 2010-06-09 15:46:41 +0000 and 
lisp/international/uni-old-name.el        2011-06-17 07:07:50 +0000 differ
=== modified file 'lisp/international/uni-titlecase.el'
Binary files lisp/international/uni-titlecase.el        2010-06-09 15:46:41 
+0000 and lisp/international/uni-titlecase.el       2011-06-17 07:07:51 +0000 
differ
=== modified file 'lisp/international/uni-uppercase.el'
Binary files lisp/international/uni-uppercase.el        2010-06-09 15:46:41 
+0000 and lisp/international/uni-uppercase.el       2011-06-17 07:07:50 +0000 
differ
=== modified file 'src/ChangeLog'
--- src/ChangeLog       2011-05-18 03:00:08 +0000
+++ src/ChangeLog       2011-06-17 07:32:14 +0000
@@ -1,3 +1,18 @@
+2011-06-17  Kenichi Handa  <address@hidden>
+
+       * chartab.c (sub_char_table_set): Delete it.
+       (char_table_get_bottom): New function.
+       (char_table_set): Use char_table_get_bottom.
+       (UNIPROP_COMPRESSED_FORM_P): New macro.
+       (uniprop_table_uncompress_simple)
+       (uniprop_table_uncompress_run_length): New functions.
+       (uniprop_uncompress, uniprop_uncompressor_count): New variables.
+       (uniprop_table, uniprop_table_lookup)
+       (Fget_unicode_property_internal): New functions.
+       (syms_of_chartab): Defsubr
+       Sget_unicode_property_internal. Defvar_lisp
+       char-code-property-alist.
+
 2011-05-18  Christoph Scholtes  <address@hidden>
 
        * menu.c: Include limits.h (fixes the MS-Windows build broken by

=== modified file 'src/chartab.c'
--- src/chartab.c       2011-05-12 07:07:06 +0000
+++ src/chartab.c       2011-06-16 03:54:55 +0000
@@ -330,7 +330,7 @@
   return val;
 }
 
-
+#if 0
 static void
 sub_char_table_set (Lisp_Object table, int c, Lisp_Object val)
 {
@@ -354,6 +354,29 @@
       sub_char_table_set (sub, c, val);
     }
 }
+#endif
+
+static Lisp_Object
+char_table_get_bottom (struct Lisp_Char_Table *tbl, int c)
+{
+  Lisp_Object *sub = tbl->contents + CHARTAB_IDX (c, 0, 0);
+  int depth;
+
+  if (! SUB_CHAR_TABLE_P (*sub))
+    *sub = make_sub_char_table (1, c & ~(chartab_chars[0] - 1), *sub);
+  for (depth = 1; depth < 3; depth++)
+    {
+      int min_char = XINT (XSUB_CHAR_TABLE (*sub)->min_char);
+      int idx = CHARTAB_IDX (c, depth, min_char);
+
+      sub = XSUB_CHAR_TABLE (*sub)->contents + idx;
+      if (! SUB_CHAR_TABLE_P (*sub))
+       *sub = make_sub_char_table (depth + 1, c & ~(chartab_chars[depth] - 1),
+                                   *sub);
+    }
+  return *sub;
+}
+
 
 Lisp_Object
 char_table_set (Lisp_Object table, int c, Lisp_Object val)
@@ -367,18 +390,11 @@
     }
   else
     {
-      int i = CHARTAB_IDX (c, 0, 0);
-      Lisp_Object sub;
+      Lisp_Object sub_table = char_table_get_bottom (tbl, c);
 
-      sub = tbl->contents[i];
-      if (! SUB_CHAR_TABLE_P (sub))
-       {
-         sub = make_sub_char_table (1, i * chartab_chars[0], sub);
-         tbl->contents[i] = sub;
-       }
-      sub_char_table_set (sub, c, val);
+      XSUB_CHAR_TABLE (sub_table)->contents[c & (chartab_chars[2] - 1)] = val;
       if (ASCII_CHAR_P (c))
-       tbl->ascii = char_table_ascii (table);
+       tbl->ascii = sub_table;
     }
   return val;
 }
@@ -984,6 +1000,213 @@
 }
 
 
+/* Unicode character property
+
+   This section provides a convenient and efficient way to get a
+   Unicode character property from C code (from Lisp, you can use
+   get-char-code-property).
+
+   The typical usage is to get a char-table for a specific property at
+   a proper initialization time as this:
+
+       Lisp_Object bidi_class_table = uniprop_table (intern ("bidi-class"));
+
+   and get a property value for character CH as this:
+
+       Lisp_Object bidi_class = uniprop_table_lookup (CH, bidi_class_table);
+
+   In this case, what you actually get is an index number to the
+   vector of property values (symbols nil, L, R, etc).  See the
+   comment of uniprop_table_lookup for the detail.  */
+
+/* Nonzero iff OBJ is a string representing Unicode character
+   properties of 128 succeeding characters (the bottom level of a
+   char-table) in a compressed format.  The test is that the first
+   byte of the string is 0; we are sure that no property value is a
+   string starting with a NUL character.  */
+
+#define UNIPROP_COMPRESSED_FORM_P(OBJ) (STRINGP (OBJ) && (SREF (OBJ, 0) == 0))
+
+
+/* The first element of uniprop_uncompress (which see).
+   This decodes the compressed data of "SIMPLE TABLE" (see the comment
+   in admin/unidata/unidata-gen.el).
+
+   STR is the compressed string.  Its first byte is the NUL marker
+   tested by UNIPROP_COMPRESSED_FORM_P; the next character is the
+   index of the first explicitly listed slot, and each following
+   character is the value of one successive slot.
+
+   Every slot of SUBTBL is overwritten: slots before the first listed
+   index and slots after the last listed value become nil, a listed
+   value V becomes the integer V if V > 0 and nil otherwise.
+   Always return Qnil.  */
+
+static Lisp_Object
+uniprop_table_uncompress_simple (Lisp_Object str,
+                                 struct Lisp_Sub_Char_Table *subtbl)
+{
+  /* Skip the leading NUL marker.  */
+  const unsigned char *p = SDATA (str) + 1;
+  /* The end must be computed from the start of the data, not from
+     the already advanced P, or it lies one byte past the string.  */
+  const unsigned char *pend = SDATA (str) + SBYTES (str);
+  int i, idx;
+
+  idx = STRING_CHAR_ADVANCE (p);
+  /* Clear the slots that precede the first listed one; otherwise
+     they would keep whatever the sub-table was created with.  The
+     extra bound guards against a malformed start index.  */
+  for (i = 0; i < idx && i < chartab_chars[2]; i++)
+    subtbl->contents[i] = Qnil;
+  while (p < pend && idx < chartab_chars[2])
+    {
+      int v = STRING_CHAR_ADVANCE (p);
+      subtbl->contents[idx++] = v > 0 ? make_number (v) : Qnil;
+    }
+  while (idx < chartab_chars[2])
+    subtbl->contents[idx++] = Qnil;
+  return Qnil;
+}
+
+/* The second element of uniprop_uncompress (which see).
+   This decodes the compressed data of "RUN-LENGTH TABLE" (see the
+   comment in admin/unidata/unidata-gen.el).
+
+   STR is the compressed string.  Its first byte is the NUL marker
+   tested by UNIPROP_COMPRESSED_FORM_P.  The rest is a sequence of
+   values; a value may be followed by a character N >= 128, meaning
+   that the value is repeated N - 128 times, otherwise the value
+   occupies a single slot.
+
+   The decoded values are stored as integers in successive slots of
+   SUBTBL starting from slot 0; decoding stops when the string or the
+   sub-table is exhausted.  Always return Qnil.  */
+
+static Lisp_Object
+uniprop_table_uncompress_run_length (Lisp_Object str,
+                                     struct Lisp_Sub_Char_Table *subtbl)
+{
+  /* Skip the leading NUL marker.  */
+  const unsigned char *p = SDATA (str) + 1;
+  /* The end must be computed from the start of the data, not from
+     the already advanced P; otherwise the terminating NUL would be
+     decoded as one more value.  */
+  const unsigned char *pend = SDATA (str) + SBYTES (str);
+  int idx = 0;
+
+  while (p < pend && idx < chartab_chars[2])
+    {
+      int v = STRING_CHAR_ADVANCE (p);
+      int count = 1;
+
+      if (p < pend)
+        {
+          int len;
+          int next = STRING_CHAR_AND_LENGTH (p, len);
+
+          /* A character >= 128 is a repeat count for V and is
+             consumed here; anything smaller is the next value and
+             is left for the next iteration.  */
+          if (next >= 128)
+            {
+              count = next - 128;
+              p += len;
+            }
+        }
+      /* Never write past the end of the sub-table, even if the run
+         lengths in STR add up to too much.  */
+      while (count-- > 0 && idx < chartab_chars[2])
+        subtbl->contents[idx++] = make_number (v);
+    }
+  return Qnil;
+}
+
+#if 0
+/* It seems that we don't need this function because C code won't need
+   to get a property that is compressed in this form.  */
+
+/* The third element of uniprop_uncompress (which see).
+   This decodes the compressed data of "WORD-LIST TABLE" (see the
+   comment in admin/unidata/unidata-gen.el).  For now it is a disabled
+   stub that decodes nothing and just returns Qnil.  */
+
+static Lisp_Object
+uniprop_table_uncompress_word_list (Lisp_Object str,
+                                   struct Lisp_Sub_Char_Table *subtbl)
+{
+  return Qnil;
+}
+#endif
+
+/* Array of functions that decode the compressed property values for
+   consecutive 128 characters in STR, and store each value in the
+   sub-chartable SUBTBL.  uniprop_table_lookup selects the function
+   by the integer stored in extras[1] of the property char-table.  */
+
+static Lisp_Object (*uniprop_uncompress []) (Lisp_Object,
+                                            struct Lisp_Sub_Char_Table *)
+  = { uniprop_table_uncompress_simple,
+      uniprop_table_uncompress_run_length};
+
+/* How many elements uniprop_uncompress has.  uniprop_table uses it
+   to reject a char-table whose extras[1] is out of range.  */
+
+static int uniprop_uncompressor_count =
+  (sizeof uniprop_uncompress) / (sizeof uniprop_uncompress[0]);
+
+/* Return a char-table for Unicode character property PROPNAME.
+
+   PROPNAME is looked up in Vchar_code_property_alist.  If the
+   associated value is a string, it is taken as the name of a file to
+   load, and the value is fetched again after loading.
+
+   Return nil if PROPNAME has no entry, if loading fails, if the value
+   is not a char-table, or if extras[1] of the char-table is not a
+   valid index into uniprop_uncompress.
+
+   This function may load a Lisp file and thus may cause
+   garbage-collection.  */
+
+Lisp_Object
+uniprop_table (Lisp_Object propname)
+{
+  Lisp_Object val, table;
+
+  val = Fassq (propname, Vchar_code_property_alist);
+  if (! CONSP (val))
+    return Qnil;
+  table = XCDR (val);
+  if (STRINGP (table))
+    {
+      /* TABLE is the file name itself; it must be passed to Fload
+         as is (it is a string, not a vector to be indexed).  */
+      Lisp_Object result = Fload (table, Qt, Qt, Qt, Qt);
+      if (NILP (result))
+        return Qnil;
+      /* Fetch the value again, now that the file has been loaded.  */
+      table = XCDR (val);
+    }
+  /* The value may still be nil or a string if the loaded file did
+     not install a table; never hand such a thing to XCHAR_TABLE.  */
+  if (! CHAR_TABLE_P (table))
+    return Qnil;
+  val = XCHAR_TABLE (table)->extras[1];
+  if (INTEGERP (val)
+      && XINT (val) >= 0 && XINT (val) < uniprop_uncompressor_count)
+    return table;
+  return Qnil;
+}
+
+/* Return a Unicode character property of character C stored in TABLE.
+   TABLE must be what the function uniprop_table returns.
+
+   If the entry for C is still in the compressed form, the bottom
+   sub-char-table covering C is created as needed and filled by the
+   decoder that extras[1] of TABLE selects, and the value is then
+   read from that sub-char-table.
+
+   If XCHAR_TABLE (TABLE)->extras[4] is a vector, the returned value
+   is an index number to that vector which contains the actual
+   property value.  */
+
+Lisp_Object
+uniprop_table_lookup (int c, Lisp_Object table)
+{
+  struct Lisp_Char_Table *tbl = XCHAR_TABLE (table);
+  struct Lisp_Sub_Char_Table *bottom;
+  Lisp_Object bottom_obj, packed;
+
+  if (! ASCII_CHAR_P (c))
+    {
+      packed = char_table_ref (table, c);
+      /* Anything not in the compressed form is already the answer.  */
+      if (! UNIPROP_COMPRESSED_FORM_P (packed))
+        return packed;
+      bottom_obj = char_table_get_bottom (tbl, c);
+      bottom = XSUB_CHAR_TABLE (bottom_obj);
+      uniprop_uncompress[XINT (tbl->extras[1])] (packed, bottom);
+    }
+  else if (SUB_CHAR_TABLE_P (tbl->ascii))
+    /* The ASCII sub-table has been unfolded before.  */
+    return XSUB_CHAR_TABLE (tbl->ascii)->contents[c];
+  else
+    {
+      /* Always unfold the bottom sub-table for ASCII chars, and
+         remember it in the `ascii' slot.  */
+      bottom_obj = char_table_get_bottom (tbl, c);
+      tbl->ascii = bottom_obj;
+      bottom = XSUB_CHAR_TABLE (bottom_obj);
+      packed = bottom->contents[c];
+      if (UNIPROP_COMPRESSED_FORM_P (packed))
+        uniprop_uncompress[XINT (tbl->extras[1])] (packed, bottom);
+    }
+  return bottom->contents[c - XINT (bottom->min_char)];
+}
+
+DEFUN ("get-unicode-property-internal", Fget_unicode_property_internal,
+       Sget_unicode_property_internal, 2, 2, 0,
+       doc: /* Get Unicode character property of CH stored in TABLE.
+Internal use only.  */)
+  (Lisp_Object ch, Lisp_Object table)
+{
+  Lisp_Object val, val_vec;
+
+  CHECK_CHARACTER (ch);
+  CHECK_CHAR_TABLE (table);
+  val = uniprop_table_lookup (XINT (ch), table);
+  val_vec = XCHAR_TABLE (table)->extras[4];
+
+  /* Only an integer result combined with a value vector needs to be
+     translated; everything else is returned as is.  */
+  if (! INTEGERP (val) || ! VECTORP (val_vec))
+    return val;
+
+  /* NOTE(review): the index is decremented before use, yet the guard
+     also excludes XINT (val) == ASIZE (val_vec), so the last element
+     of the vector is never returned -- confirm the intended base.  */
+  if (XINT (val) > 0 && XINT (val) < ASIZE (val_vec))
+    return AREF (val_vec, XINT (val) - 1);
+  return Qnil;
+}
+
+
 void
 syms_of_chartab (void)
 {
@@ -998,4 +1221,17 @@
   defsubr (&Sset_char_table_default);
   defsubr (&Soptimize_char_table);
   defsubr (&Smap_char_table);
+  defsubr (&Sget_unicode_property_internal);
+
+  /* Each element has the form (PROP . TABLE).
+     PROP is a symbol representing a character property.
+     TABLE is a char-table containing the property value for each character.
+     TABLE may be a name of file to load to build a char-table.
+     This variable should be modified only through
+     `define-char-code-property'. */
+
+  DEFVAR_LISP ("char-code-property-alist", Vchar_code_property_alist,
+              doc: /* Alist of character property name vs char-table containing property values.
+Internal use only.  */);
+  Vchar_code_property_alist = Qnil;
 }




reply via email to

[Prev in Thread] Current Thread [Next in Thread]