emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] Changes to emacs/lisp/international/utf-8.el


From: Kenichi Handa
Subject: [Emacs-diffs] Changes to emacs/lisp/international/utf-8.el
Date: Mon, 30 Sep 2002 02:35:14 -0400

Index: emacs/lisp/international/utf-8.el
diff -c emacs/lisp/international/utf-8.el:1.18 emacs/lisp/international/utf-8.el:1.19
*** emacs/lisp/international/utf-8.el:1.18      Wed Sep 11 16:57:50 2002
--- emacs/lisp/international/utf-8.el   Mon Sep 30 02:35:13 2002
***************
*** 46,57 ****
  ;; Fixme: note that reading and writing invalid utf-8 may not be
  ;; idempotent -- to represent the bytes to fix that needs a new charset.
  ;;
! ;; Characters from other character sets can be encoded with
! ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
! ;; registering the translation with `register-char-codings'.  Hash
! ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
! ;; support encoding and decoding of about a quarter of the CJK space
! ;; between U+3400 and U+DFFF.
  
  ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
  
--- 46,58 ----
  ;; Fixme: note that reading and writing invalid utf-8 may not be
  ;; idempotent -- to represent the bytes to fix that needs a new charset.
  ;;
! ;; Characters from other character sets can be encoded with mule-utf-8
! ;; by populating the translation-table
! ;; `utf-translation-table-for-encode' and registering the translation
! ;; with `register-char-codings'.  Hash tables
! ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
! ;; used to support encoding and decoding of about a quarter of the CJK
! ;; space between U+3400 and U+DFFF.
  
  ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
  
***************
*** 64,97 ****
  
  ;;; Code:
  
! (defvar ucs-mule-to-mule-unicode (make-translation-table)
!   "Translation table for encoding to `mule-utf-8'.")
! (define-translation-table 'ucs-mule-to-mule-unicode
!   ucs-mule-to-mule-unicode)
! 
! (defvar utf-8-subst-table (make-hash-table :test 'eq))
! (defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
! (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
! (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
! 
! (defvar utf-8-translation-table-for-decode (make-translation-table)
!   "Translation table applied after decoding utf-8 to mule-unicode.
! This is only actually applied to characters which would normally be
! decoded into mule-unicode-0100-24ff.")
! (define-translation-table 'utf-8-translation-table-for-decode
!   utf-8-translation-table-for-decode)
  
  ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
  ;; space of mule-unicode.  For Latin scripts this isn't very
  ;; important.  Hebrew and Arabic might go here too when there's proper
  ;; support for them.
! (defvar utf-8-fragmentation-table (make-translation-table)
!   "Char table normally mapping non-Latin mule-unicode-... characters to iso8859.
! Used as the value of `utf-8-translation-table-for-decode' in
! `utf-8-fragment-on-decoding' mode.")
  (mapc
   (lambda (pair)
!    (aset utf-8-fragmentation-table (car pair) (cdr pair)))
   '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
     (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B)
     (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
--- 65,122 ----
  
  ;;; Code:
  
! (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
!   "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
! 
! If `unify-8859-on-encoding-mode' is non-nil, this table populates the
! translation-table named `utf-translation-table-for-encode'.")
! 
! (define-translation-table 'utf-translation-table-for-encode)
! 
  
  ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
  ;; space of mule-unicode.  For Latin scripts this isn't very
  ;; important.  Hebrew and Arabic might go here too when there's proper
  ;; support for them.
! 
! (defvar utf-fragmentation-table (make-char-table 'translation-table nil)
!   "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
! 
! If `utf-fragment-on-decoding' is non-nil, this table populates the
! translation-table named `utf-translation-table-for-decode'")
! 
! (defvar utf-defragmentation-table (make-char-table 'translation-table nil)
!   "Char-table for reverse mapping of `utf-fragmentation-table'.
! 
! If `utf-fragment-on-decoding' is non-nil and
! `unify-8859-on-encoding-mode' is nil, this table populates the
! translation-table named `utf-translation-table-for-encode'")
! 
! (define-translation-table 'utf-translation-table-for-decode)
! 
! 
! (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
!   "Hash table mapping Emacs CJK character sets to Unicode code points.
! 
! If `utf-translate-cjk' is non-nil, this table populates the
! translation-hash-table named `utf-subst-table-for-encode'.")
! 
! (define-translation-hash-table 'utf-subst-table-for-encode 
!   (make-hash-table :test 'eq))
! 
! (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
!   "Hash table mapping Unicode code points to Emacs CJK character sets.
! 
! If `utf-translate-cjk' is non-nil, this table populates the
! translation-hash-table named `utf-subst-table-for-decode'.")
! 
! (define-translation-hash-table 'utf-subst-table-for-decode
!   (make-hash-table :test 'eq))
! 
  (mapc
   (lambda (pair)
!    (aset utf-fragmentation-table (car pair) (cdr pair))
!    (aset utf-defragmentation-table (cdr pair) (car pair)))
   '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
     (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B)
     (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
***************
*** 128,135 ****
     (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
     (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
  
! (defcustom utf-8-fragment-on-decoding nil
!   "Whether or not to decode some scripts in UTF-8 text into iso8859 charsets.
  Setting this means that the relevant Cyrillic and Greek characters are
  decoded into the iso8859 charsets rather than into
  mule-unicode-0100-24ff.  The iso8859 charsets take half as much space
--- 153,161 ----
     (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
     (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
  
! 
! (defcustom utf-fragment-on-decoding nil
!   "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
  Setting this means that the relevant Cyrillic and Greek characters are
  decoded into the iso8859 charsets rather than into
  mule-unicode-0100-24ff.  The iso8859 charsets take half as much space
***************
*** 140,179 ****
  
  Setting this variable outside customize has no effect."
    :set (lambda (s v)
!        (setq utf-8-translation-table-for-decode
!              (if v
!                  utf-8-fragmentation-table
!                (make-translation-table)))
!        (define-translation-table 'utf-8-translation-table-for-decode
!          utf-8-translation-table-for-decode)
         (set-default s v))
    :version "21.4"
    :type 'boolean
    :group 'mule)
  
! (defcustom utf-8-translate-cjk nil
!   "Whether the `mule-utf-8' coding system should encode many CJK characters.
  
! Enabling this loads tables which enable the coding system to encode
! characters in the charsets `korean-ksc5601', `chinese-gb2312' and
  `japanese-jisx0208', and to decode the corresponding unicodes into
  such characters.  This works by loading the library `utf-8-subst'; see
  its commentary.  The tables are fairly large (about 33000 entries), so this
  option is not the default."
    :link '(emacs-commentary-link "utf-8-subst")
    :set (lambda (s v)
!        (when v
!          (require 'utf-8-subst)
!          (let ((table (make-char-table 'translation-table)))
!            (coding-system-put 'mule-utf-8 'safe-charsets
!                               (append (coding-system-get 'mule-utf-8
!                                                          'safe-charsets)
!                                       '(korean-ksc5601 chinese-gb2312
!                                                        japanese-jisx0208)))
!            (maphash (lambda (k v)
!                       (aset table k v))
!                     utf-8-subst-rev-table)
!            (register-char-codings 'mule-utf-8 table)))
         (set-default s v))
    :version "21.4"
    :type 'boolean
--- 166,246 ----
  
  Setting this variable outside customize has no effect."
    :set (lambda (s v)
!        (if v
!            (progn
!              (define-translation-table 'utf-translation-table-for-decode
!                utf-fragmentation-table)
!              ;; Even if unify-8859-on-encoding-mode is off, make
!              ;; mule-utf-* encode characters in
!              ;; utf-fragmentation-table.
!              (unless (eq (get 'utf-translation-table-for-encode
!                               'translation-table)
!                          ucs-mule-to-mule-unicode)
!                (define-translation-table 'utf-translation-table-for-encode
!                  utf-defragmentation-table)
!                (dolist (coding '(mule-utf-8 mule-utf-16-be mule-utf-16-le))
!                  (register-char-codings coding utf-defragmentation-table))))
!          (define-translation-table 'utf-translation-table-for-decode)
!          ;; When unify-8859-on-encoding-mode is off, be sure to make
!          ;; mule-utf-* disabled for characters in
!          ;; utf-fragmentation-table.
!          (unless (eq (get 'utf-translation-table-for-encode
!                           'translation-table)
!                      ucs-mule-to-mule-unicode)
!            (define-translation-table 'utf-translation-table-for-encode)
!            (map-char-table
!             (lambda (key val)
!               (if (and (>= key 128) val)
!                   (aset char-coding-system-table key
!                         (delq 'mule-utf-8
!                               (delq 'mule-utf-16-le
!                                     (delq 'mule-utf-16-be
!                                           (aref char-coding-system-table
!                                                 key)))))))
!             utf-defragmentation-table)))
         (set-default s v))
    :version "21.4"
    :type 'boolean
    :group 'mule)
  
! (defcustom utf-translate-cjk nil
!   "Whether the UTF based coding systems should decode/encode CJK characters.
  
! Enabling this loads tables which enable the coding systems:
!     mule-utf-8, mule-utf-16-le, mule-utf-16-be
! to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and
  `japanese-jisx0208', and to decode the corresponding unicodes into
  such characters.  This works by loading the library `utf-8-subst'; see
  its commentary.  The tables are fairly large (about 33000 entries), so this
  option is not the default."
    :link '(emacs-commentary-link "utf-8-subst")
    :set (lambda (s v)
!        (if v
!            (progn
!              (require 'utf-8-subst)
!              (let ((table (make-char-table 'translation-table)))
!                (maphash (lambda (k v)
!                           (aset table k t))
!                         ucs-mule-cjk-to-unicode)
!                (register-char-codings 'mule-utf-8 table)
!                (register-char-codings 'mule-utf-16-le table)
!                (register-char-codings 'mule-utf-16-be table))
!              (define-translation-hash-table 'utf-subst-table-for-decode
!                ucs-unicode-to-mule-cjk)
!              (define-translation-hash-table 'utf-subst-table-for-encode
!                ucs-mule-cjk-to-unicode))
!          (map-char-table
!           (lambda (k v)
!             (if (gethash k ucs-mule-cjk-to-unicode)
!                 (aset char-coding-system-table k
!                       (delq 'mule-utf-8
!                             (delq 'mule-utf-16-le
!                                   (delq 'mule-utf-16-be v))))))
!           char-coding-system-table)
!          (define-translation-hash-table 'utf-subst-table-for-decode
!            (make-hash-table :test 'eq))
!          (define-translation-hash-table 'utf-subst-table-for-encode
!            (make-hash-table :test 'eq)))
         (set-default s v))
    :version "21.4"
    :type 'boolean
***************
*** 263,269 ****
                         (r1 %= 96)
                         (r1 += (r2 + 32))
                         (translate-character
!                         utf-8-translation-table-for-decode r0 r1)
                         (write-multibyte-character r0 r1))))))))
  
            ;; 3byte encoding
--- 330,336 ----
                         (r1 %= 96)
                         (r1 += (r2 + 32))
                         (translate-character
!                         utf-translation-table-for-decode r0 r1)
                         (write-multibyte-character r0 r1))))))))
  
            ;; 3byte encoding
***************
*** 308,321 ****
                         (r1 = (r7 + 32))
                         (r1 += ((r3 + 32) << 7))
                         (translate-character
!                         utf-8-translation-table-for-decode r0 r1)
                         (write-multibyte-character r0 r1))
                    
                      ;; mule-unicode-2500-33ff
                      ;; Fixme: Perhaps allow translation via
!                     ;; utf-8-subst-table for #x2e80 up, so that we use
!                     ;; consistent charsets for all of CJK.  Would need
!                     ;; corresponding change to encoding tables.
                      (if (r3 < #x3400)
                          ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
                           (r3 -= #x2500)
--- 375,389 ----
                         (r1 = (r7 + 32))
                         (r1 += ((r3 + 32) << 7))
                         (translate-character
!                         utf-translation-table-for-decode r0 r1)
                         (write-multibyte-character r0 r1))
                    
                      ;; mule-unicode-2500-33ff
                      ;; Fixme: Perhaps allow translation via
!                     ;; utf-subst-table-for-decode for #x2e80 up, so
!                     ;; that we use consistent charsets for all of
!                     ;; CJK.  Would need corresponding change to
!                     ;; encoding tables.
                      (if (r3 < #x3400)
                          ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
                           (r3 -= #x2500)
***************
*** 329,335 ****
                        ;; them as eight-bit-{control|graphic}.
                        (if (r3 < #xd800)
                            ((r4 = r3)  ; don't zap r3
!                            (lookup-integer utf-8-subst-table r4 r5)
                             (if r7
                                 ;; got a translation
                                 ((write-multibyte-character r4 r5)
--- 397,403 ----
                        ;; them as eight-bit-{control|graphic}.
                        (if (r3 < #xd800)
                            ((r4 = r3)  ; don't zap r3
!                            (lookup-integer utf-subst-table-for-decode r4 r5)
                             (if r7
                                 ;; got a translation
                                 ((write-multibyte-character r4 r5)
***************
*** 370,376 ****
              (if (r0 < #xfe)
                  ;; 4byte encoding
                  ;; keep those bytes as eight-bit-{control|graphic}
!                 ;; Fixme: allow lookup in utf-8-subst-table.
                  ((read r1 r2 r3)
                   ;; r0 > #xf0, thus eight-bit-graphic
                   (write-multibyte-character r6 r0)
--- 438,444 ----
              (if (r0 < #xfe)
                  ;; 4byte encoding
                  ;; keep those bytes as eight-bit-{control|graphic}
!                 ;; Fixme: allow lookup in utf-subst-table-for-decode.
                  ((read r1 r2 r3)
                   ;; r0 > #xf0, thus eight-bit-graphic
                   (write-multibyte-character r6 r0)
***************
*** 409,416 ****
  
    "CCL program to decode UTF-8.
  Basic decoding is done into the charsets ascii, latin-iso8859-1 and
! mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
! `utf-8-subst-table'.
  Encodings of un-representable Unicode characters are decoded asis into
  eight-bit-control and eight-bit-graphic characters.")
  
--- 477,484 ----
  
    "CCL program to decode UTF-8.
  Basic decoding is done into the charsets ascii, latin-iso8859-1 and
! mule-unicode-*, but see also `utf-fragmentation-table' and
! `ucs-mule-cjk-to-unicode'.
  Encodings of un-representable Unicode characters are decoded asis into
  eight-bit-control and eight-bit-graphic characters.")
  
***************
*** 421,427 ****
        (if (r5 < 0)
          ((r1 = -1)
           (read-multibyte-character r0 r1)
!          (translate-character ucs-mule-to-mule-unicode r0 r1))
        (;; We have already done read-multibyte-character.
         (r0 = r5)
         (r1 = r6)
--- 489,495 ----
        (if (r5 < 0)
          ((r1 = -1)
           (read-multibyte-character r0 r1)
!          (translate-character utf-translation-table-for-encode r0 r1))
        (;; We have already done read-multibyte-character.
         (r0 = r5)
         (r1 = r6)
***************
*** 516,522 ****
                                ((write #xc2)
                                 (write r1)))))))
  
!                   ((lookup-character utf-8-subst-rev-table r0 r1)
                     (if r7             ; lookup succeeded
                         ((r1 = (((r0 & #xf000) >> 12) | #xe0))
                          (r2 = ((r0 & #x3f) | #x80))
--- 584,590 ----
                                ((write #xc2)
                                 (write r1)))))))
  
!                   ((lookup-character utf-subst-table-for-encode r0 r1)
                     (if r7             ; lookup succeeded
                         ((r1 = (((r0 & #xf000) >> 12) | #xe0))
                          (r2 = ((r0 & #x3f) | #x80))
***************
*** 538,547 ****
  
    "CCL program to encode into UTF-8.")
  
- ;; Dummy definition so that the CCL can be checked correctly; the
- ;; actual data are loaded on demand.
- (unless (boundp 'ucs-mule-8859-to-mule-unicode)       ; don't zap it
-   (define-translation-table 'ucs-mule-8859-to-mule-unicode))
  
  (define-ccl-program ccl-untranslated-to-ucs
    `(0
--- 606,611 ----
***************
*** 648,654 ****
  ;; ucs-tables is preloaded
  ;; (defun utf-8-pre-write-conversion (beg end)
  ;;   "Semi-dummy pre-write function effectively to autoload ucs-tables."
! ;;   ;; Ensure translation table is loaded.
  ;;   (require 'ucs-tables)
  ;;   ;; Don't do this again.
  ;;   (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
--- 712,718 ----
  ;; ucs-tables is preloaded
  ;; (defun utf-8-pre-write-conversion (beg end)
  ;;   "Semi-dummy pre-write function effectively to autoload ucs-tables."
! ;;   ;; Ensure translation-table is loaded.
  ;;   (require 'ucs-tables)
  ;;   ;; Don't do this again.
  ;;   (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
***************
*** 657,689 ****
  (make-coding-system
   'mule-utf-8 4 ?u
   "UTF-8 encoding for Emacs-supported Unicode characters.
! The supported Emacs character sets are the following, plus any other
! characters included in the tables `ucs-mule-to-mule-unicode' and
! `utf-8-subst-rev-table':
!  ascii
!  eight-bit-control
!  eight-bit-graphic
!  latin-iso8859-1
!  latin-iso8859-2
!  latin-iso8859-3
!  latin-iso8859-4
!  cyrillic-iso8859-5
!  greek-iso8859-7
!  hebrew-iso8859-8
!  latin-iso8859-9
!  latin-iso8859-14
!  latin-iso8859-15
!  mule-unicode-0100-24ff
!  mule-unicode-2500-33ff
!  mule-unicode-e000-ffff
! 
! Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
! may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
! \(see user option `utf-8-translate-cjk'); otherwise, sequences of
! eight-bit-control and eight-bit-graphic characters are used to
! preserve their byte sequences, and these are composed to display as a
! single character.  Emacs characters that otherwise can't be encoded
! are encoded as U+FFFD."
  
   '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
   '((safe-charsets
--- 721,741 ----
  (make-coding-system
   'mule-utf-8 4 ?u
   "UTF-8 encoding for Emacs-supported Unicode characters.
! It supports Unicode characters of these ranges:
!     U+0000..U+33FF, U+E000..U+FFFF.
! They correspond to these Emacs character sets:
!     ascii, latin-iso8859-1, mule-unicode-0100-24ff,
!     mule-unicode-2500-33ff, mule-unicode-e000-ffff
! 
! On decoding (e.g. reading a file), Unicode characters not in the above
! ranges are decoded into sequences of eight-bit-control and
! eight-bit-graphic characters to preserve their byte sequences.  The
! byte sequence is preserved on i/o for valid utf-8, but not necessarily
! for invalid utf-8.
! 
! On encoding (e.g. writing a file), Emacs characters not belonging to
! any of the character sets listed above are encoded into the UTF-8 byte
! sequence representing U+FFFD (REPLACEMENT CHARACTER)."
  
   '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
   '((safe-charsets
***************
*** 691,714 ****
      eight-bit-control
      eight-bit-graphic
      latin-iso8859-1
-     latin-iso8859-15
-     latin-iso8859-14
-     latin-iso8859-9
-     hebrew-iso8859-8
-     greek-iso8859-7
-     cyrillic-iso8859-5
-     latin-iso8859-4
-     latin-iso8859-3
-     latin-iso8859-2
-     vietnamese-viscii-lower
-     vietnamese-viscii-upper
-     thai-tis620
-     ipa
-     ethiopic
-     indian-is13194
-     katakana-jisx0201
-     chinese-sisheng
-     lao
      mule-unicode-0100-24ff
      mule-unicode-2500-33ff
      mule-unicode-e000-ffff)
--- 743,748 ----
***************
*** 716,722 ****
     (coding-category . coding-category-utf-8)
     (valid-codes (0 . 255))
  ;;    (pre-write-conversion . utf-8-pre-write-conversion)
!    (post-read-conversion . utf-8-post-read-conversion)))
  
  (define-coding-system-alias 'utf-8 'mule-utf-8)
  
--- 750,760 ----
     (coding-category . coding-category-utf-8)
     (valid-codes (0 . 255))
  ;;    (pre-write-conversion . utf-8-pre-write-conversion)
!    (post-read-conversion . utf-8-post-read-conversion)
!    (dependency unify-8859-on-encoding-mode
!              unify-8859-on-decoding-mode
!              utf-fragment-on-decoding
!              utf-translate-cjk)))
  
  (define-coding-system-alias 'utf-8 'mule-utf-8)
  




reply via email to

[Prev in Thread] Current Thread [Next in Thread]