emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] Changes to emacs/lisp/international/utf-8.el


From: Dave Love
Subject: [Emacs-diffs] Changes to emacs/lisp/international/utf-8.el
Date: Wed, 17 Jul 2002 11:04:26 -0400

Index: emacs/lisp/international/utf-8.el
diff -c emacs/lisp/international/utf-8.el:1.13 emacs/lisp/international/utf-8.el:1.14
*** emacs/lisp/international/utf-8.el:1.13      Sat Apr  6 05:44:36 2002
--- emacs/lisp/international/utf-8.el   Wed Jul 17 11:04:25 2002
***************
*** 1,10 ****
! ;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
  
  ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
  ;; Licensed to the Free Software Foundation.
! ;; Copyright (C) 2001 Free Software Foundation, Inc.
  
  ;; Author: TAKAHASHI Naoto  <address@hidden>
  ;; Keywords: multilingual, Unicode, UTF-8, i18n
  
  ;; This file is part of GNU Emacs.
--- 1,11 ----
! ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
  
  ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
  ;; Licensed to the Free Software Foundation.
! ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
  
  ;; Author: TAKAHASHI Naoto  <address@hidden>
+ ;; Maintainer: FSF
  ;; Keywords: multilingual, Unicode, UTF-8, i18n
  
  ;; This file is part of GNU Emacs.
***************
*** 39,49 ****
  ;; On decoding, Unicode characters that do not fit into the above
  ;; character sets are handled as `eight-bit-control' or
  ;; `eight-bit-graphic' characters to retain the information about the
! ;; original byte sequence.
  ;;
  ;; Characters from other character sets can be encoded with
  ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
! ;; registering the translation with `register-char-codings'.
  
  ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
  
--- 40,57 ----
  ;; On decoding, Unicode characters that do not fit into the above
  ;; character sets are handled as `eight-bit-control' or
  ;; `eight-bit-graphic' characters to retain the information about the
! ;; original byte sequence and text properties record the corresponding
! ;; unicode.
! ;;
! ;; Fixme: note that reading and writing invalid utf-8 may not be
! ;; idempotent -- fixing that needs a new charset to represent the bytes.
  ;;
  ;; Characters from other character sets can be encoded with
  ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
! ;; registering the translation with `register-char-codings'.  Hash
! ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
! ;; support encoding and decoding of about a quarter of the CJK space
! ;; between U+3400 and U+DFFF.
  
  ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
  
***************
*** 60,66 ****
    "Translation table for encoding to `mule-utf-8'.")
  ;; Could have been done by ucs-tables loaded before.
  (unless (get 'ucs-mule-to-mule-unicode 'translation-table)
!   (define-translation-table 'ucs-mule-to-mule-unicode 
ucs-mule-to-mule-unicode))
  (define-ccl-program ccl-decode-mule-utf-8
    ;;
    ;;        charset         | bytes in utf-8 | bytes in emacs
--- 68,178 ----
    "Translation table for encoding to `mule-utf-8'.")
  ;; Could have been done by ucs-tables loaded before.
  (unless (get 'ucs-mule-to-mule-unicode 'translation-table)
!   (define-translation-table 'ucs-mule-to-mule-unicode
!     ucs-mule-to-mule-unicode))
! 
! (defvar utf-8-subst-table (make-hash-table :test 'eq))
! (defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
! (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
! (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
! 
! (defvar utf-8-translation-table-for-decode (make-translation-table)
!   "Translation table applied after decoding utf-8 to mule-unicode.
! This is only actually applied to characters which would normally be
! decoded into mule-unicode-0100-24ff.")
! (define-translation-table 'utf-8-translation-table-for-decode
!   utf-8-translation-table-for-decode)
! 
! ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
! ;; space of mule-unicode.  For Latin scripts this isn't very
! ;; important.  Hebrew and Arabic might go here too when there's proper
! ;; support for them.
! (mapc
!  (lambda (pair)
!    (aset utf-8-translation-table-for-decode (car pair) (cdr pair)))
!  '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) 
(?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
!    (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) 
(?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,address@hidden(B)
!    (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) 
(?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
!    (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) 
(?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B)
!    (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) 
(?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B)
!    (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) 
(?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B)
!    (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) 
(?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B)
!    (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) 
(?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B)
!    (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) 
(?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B)
!    (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) 
(?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B)
!    (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) 
(?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B)
!    (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) 
(?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B)
!    (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) 
(?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B)
!    (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) 
(?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B)
!    (?$,1'N(B . ?,F~(B)
! 
!    (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) 
(?$,1($(B . ?,L$(B)
!    (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) 
(?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B)
!    (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) 
(?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B)
!    (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) 
(?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B)
!    (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) 
(?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B)
!    (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) 
(?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B)
!    (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,address@hidden(B) (?$,1(A(B . 
?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B)
!    (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) 
(?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B)
!    (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) 
(?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B)
!    (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) 
(?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B)
!    (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) 
(?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B)
!    (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) 
(?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B)
!    (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) 
(?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B)
!    (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) 
(?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B)
!    (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) 
(?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B)
!    (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) 
(?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B)
!    (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) 
(?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B)
!    (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) 
(?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
!    (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
! 
! (defcustom utf-8-fragment-on-decoding nil
!   "Whether or not to decode some scripts in UTF-8 text into 8-bit characters.
! Setting this means that the relevant Cyrillic and Greek characters are
! decoded into the iso8859 charsets rather than into
! mule-unicode-0100-24ff.  The 8-bit characters take half as much space
! in the buffer, but using them may affect how the buffer can be re-encoded
! and may require a different input method to search for them, for instance.
! See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
! for mechanisms to make this largely transparent."
!   :set (lambda (s v)
!        (if v
!            (define-translation-table 'utf-8-translation-table-for-decode
!              utf-8-translation-table-for-decode)
!          (define-translation-table 'utf-8-translation-table-for-decode))
!        (set-default s v))
!   :version "21.4"
!   :type 'boolean
!   :group 'mule)
! 
! (defcustom utf-8-translate-cjk nil
!   "Whether the `mule-utf-8' coding system should encode many CJK characters.
! 
! Enabling this loads tables which enable the coding system to encode
! characters in the charsets `korean-ksc5601', `chinese-gb2312' and
! `japanese-jisx0208', and to decode the corresponding unicodes into
! such characters.  This works by loading the library `utf-8-subst'; see
! its commentary.  The tables are fairly large (about 33000 entries), so this
! option is not the default."
!   :link '(emacs-commentary-link "utf-8-subst")
!   :set (lambda (s v)
!        (when v
!          (require 'utf-8-subst)
!          (let ((table (make-char-table 'translation-table)))
!            (coding-system-put 'mule-utf-8 'safe-charsets
!                               (append (coding-system-get 'mule-utf-8
!                                                          'safe-charsets)
!                                       '(korean-ksc5601 chinese-gb2312
!                                                        japanese-jisx0208)))
!            (maphash (lambda (k v)
!                       (aset table k v))
!                     utf-8-subst-rev-table)
!            (register-char-codings 'mule-utf-8 table)))
!        (set-default s v))
!   :version "21.4"
!   :type 'boolean
!   :group 'mule)
! 
  (define-ccl-program ccl-decode-mule-utf-8
    ;;
    ;;        charset         | bytes in utf-8 | bytes in emacs
***************
*** 90,155 ****
        ;; 1byte encoding, i.e., ascii
        (if (r0 < #x80)
          (write r0)
  
!       ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
!       (if (r0 < #xe0)
!           ((read r1)
! 
!            (if ((r1 & #b11000000) != #b10000000)
!                ;; Invalid 2-byte sequence
!                ((if (r0 < #xa0)
!                     (write-multibyte-character r5 r0)
!                   (write-multibyte-character r6 r0))
!                 (if (r1 < #x80)
!                     (write r1)
!                   (if (r1 < #xa0)
!                       (write-multibyte-character r5 r1)
!                     (write-multibyte-character r6 r1))))
! 
!              ((r0 &= #x1f)
!               (r0 <<= 6)
!               (r1 &= #x3f)
!               (r1 += r0)
!               ;; Now r1 holds scalar value
! 
!               ;; eight-bit-control
!               (if (r1 < 160)
!                   ((write-multibyte-character r5 r1))
! 
!                 ;; latin-iso8859-1
!                 (if (r1 < 256)
!                     ((r0 = ,(charset-id 'latin-iso8859-1))
!                      (r1 -= 128)
!                      (write-multibyte-character r0 r1))
! 
!                   ;; mule-unicode-0100-24ff (< 0800)
!                   ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
!                    (r1 -= #x0100)
!                    (r2 = (((r1 / 96) + 32) << 7))
!                    (r1 %= 96)
!                    (r1 += (r2 + 32))
!                    (write-multibyte-character r0 r1)))))))
! 
!         ;; 3byte encoding
!         ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
!         (if (r0 < #xf0)
!             ((read r1 r2)
! 
!              ;; This is set to 1 if the encoding is invalid.
!              (r4 = 0)
! 
!              (r3 = (r1 & #b11000000))
!              (r3 |= ((r2 >> 2) & #b00110000))
!              (if (r3 != #b10100000)
!                  (r4 = 1)
!                ((r3 = ((r0 & #x0f) << 12))
!                 (r3 += ((r1 & #x3f) << 6))
!                 (r3 += (r2 & #x3f))
!                 (if (r3 < #x0800)
!                     (r4 = 1))))
! 
!              (if (r4 != 0)
!                  ;; Invalid 3-byte sequence
                   ((if (r0 < #xa0)
                        (write-multibyte-character r5 r0)
                      (write-multibyte-character r6 r0))
--- 202,217 ----
        ;; 1byte encoding, i.e., ascii
        (if (r0 < #x80)
          (write r0)
+       (if (r0 < #xc0)             ; continuation byte (invalid here)
+           (if (r0 < #xa0)
+               (write-multibyte-character r5 r0)
+             (write-multibyte-character r6 r0))
+         ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
+         (if (r0 < #xe0)
+             ((read r1)
  
!              (if ((r1 & #b11000000) != #b10000000)
!                  ;; Invalid 2-byte sequence
                   ((if (r0 < #xa0)
                        (write-multibyte-character r5 r0)
                      (write-multibyte-character r6 r0))
***************
*** 157,231 ****
                        (write r1)
                      (if (r1 < #xa0)
                          (write-multibyte-character r5 r1)
!                       (write-multibyte-character r6 r1)))
!                   (if (r2 < #x80)
!                       (write r2)
!                     (if (r2 < #xa0)
!                         (write-multibyte-character r5 r2)
!                       (write-multibyte-character r6 r2))))
                 
!                ;; mule-unicode-0100-24ff (>= 0800)
!                ((if (r3 < #x2500)
!                     ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
!                      (r3 -= #x0100)
!                      (r3 //= 96)
!                      (r1 = (r7 + 32))
!                      (r1 += ((r3 + 32) << 7))
!                      (write-multibyte-character r0 r1))
!                   
!                   ;; mule-unicode-2500-33ff
!                   (if (r3 < #x3400)
!                       ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
!                        (r3 -= #x2500)
                         (r3 //= 96)
                         (r1 = (r7 + 32))
                         (r1 += ((r3 + 32) << 7))
                         (write-multibyte-character r0 r1))
!                     
!                     ;; U+3400 .. U+DFFF
!                   ;; keep those bytes as eight-bit-{control|graphic}
!                     (if (r3 < #xe000)
!                         ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
!                          (r3 = r6)
!                          (write-multibyte-character r3 r0)
!                          (if (r1 < #xa0)
!                              (r3 = r5))
!                          (write-multibyte-character r3 r1)
!                          (if (r2 < #xa0)
!                              (r3 = r5)
!                            (r3 = r6))
!                          (write-multibyte-character r3 r2))
                        
!                       ;; mule-unicode-e000-ffff
!                       ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
!                        (r3 -= #xe000)
!                        (r3 //= 96)
!                        (r1 = (r7 + 32))
!                        (r1 += ((r3 + 32) << 7))
!                        (write-multibyte-character r0 r1))))))))
! 
!           ;; 4byte encoding
!           ;; keep those bytes as eight-bit-{control|graphic}
!           ((read r1 r2 r3)
!            ;; r0 > #xf0, thus eight-bit-graphic
!            (write-multibyte-character r6 r0)
!            (if (r1 < #xa0)
!                (write-multibyte-character r5 r1)
!              (write-multibyte-character r6 r1))
!            (if (r2 < #xa0)
!                (write-multibyte-character r5 r2)
!              (write-multibyte-character r6 r2))
!            (if (r3 < #xa0)
!                (write-multibyte-character r5 r3)
!              (write-multibyte-character r6 r3))))))
! 
        (repeat))))
  
    "CCL program to decode UTF-8.
  Basic decoding is done into the charsets ascii, latin-iso8859-1 and
! mule-unicode-*.  Encodings of un-representable Unicode characters are
! decoded asis into eight-bit-control and eight-bit-graphic
! characters.")
  
  (define-ccl-program ccl-encode-mule-utf-8
    `(1
--- 219,413 ----
                        (write r1)
                      (if (r1 < #xa0)
                          (write-multibyte-character r5 r1)
!                       (write-multibyte-character r6 r1))))
! 
!                ((r3 = r0)        ; save in case of overlong sequence
!                 (r2 = r1)
!                 (r0 &= #x1f)
!                 (r0 <<= 6)
!                 (r2 = r1)        ; save in case of overlong sequence
!                 (r1 &= #x3f)
!                 (r1 += r0)
!                 ;; Now r1 holds scalar value
! 
!                 (if (r1 < 128)        ; `overlong sequence'
!                     ((if (r3 < #xa0)
!                          (write-multibyte-character r5 r3)
!                        (write-multibyte-character r6 r3))
!                      (if (r2 < #x80)
!                          (write r2)
!                        (if (r2 < #xa0)
!                            (write-multibyte-character r5 r2)
!                          (write-multibyte-character r6 r2))))
! 
!                   ;; eight-bit-control
!                   (if (r1 < 160)
!                       ((write-multibyte-character r5 r1))
! 
!                     ;; latin-iso8859-1
!                     (if (r1 < 256)
!                         ((r0 = ,(charset-id 'latin-iso8859-1))
!                          (r1 -= 128)
!                          (write-multibyte-character r0 r1))
! 
!                       ;; mule-unicode-0100-24ff (< 0800)
!                       ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
!                        (r1 -= #x0100)
!                        (r2 = (((r1 / 96) + 32) << 7))
!                        (r1 %= 96)
!                        (r1 += (r2 + 32))
!                        (translate-character
!                         utf-8-translation-table-for-decode r0 r1)
!                        (write-multibyte-character r0 r1))))))))
! 
!           ;; 3byte encoding
!           ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
!           (if (r0 < #xf0)
!               ((read r1 r2)
! 
!                ;; This is set to 1 if the encoding is invalid.
!                (r4 = 0)
! 
!                (r3 = (r1 & #b11000000))
!                (r3 |= ((r2 >> 2) & #b00110000))
!                (if (r3 != #b10100000)
!                    (r4 = 1)
!                  ((r3 = ((r0 & #x0f) << 12))
!                   (r3 += ((r1 & #x3f) << 6))
!                   (r3 += (r2 & #x3f))
!                   (if (r3 < #x0800)
!                       (r4 = 1))))
! 
!                (if (r4 != 0)
!                    ;; Invalid 3-byte sequence
!                    ((if (r0 < #xa0)
!                         (write-multibyte-character r5 r0)
!                       (write-multibyte-character r6 r0))
!                     (if (r1 < #x80)
!                         (write r1)
!                       (if (r1 < #xa0)
!                           (write-multibyte-character r5 r1)
!                         (write-multibyte-character r6 r1)))
!                     (if (r2 < #x80)
!                         (write r2)
!                       (if (r2 < #xa0)
!                           (write-multibyte-character r5 r2)
!                         (write-multibyte-character r6 r2))))
                 
!                  ;; mule-unicode-0100-24ff (>= 0800)
!                  ((if (r3 < #x2500)
!                       ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
!                        (r3 -= #x0100)
                         (r3 //= 96)
                         (r1 = (r7 + 32))
                         (r1 += ((r3 + 32) << 7))
+                        (translate-character
+                         utf-8-translation-table-for-decode r0 r1)
                         (write-multibyte-character r0 r1))
!                   
!                     ;; mule-unicode-2500-33ff
!                     ;; Fixme: Perhaps allow translation via
!                     ;; utf-8-subst-table for #x2e80 up, so that we use
!                     ;; consistent charsets for all of CJK.  Would need
!                     ;; corresponding change to encoding tables.
!                     (if (r3 < #x3400)
!                         ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
!                          (r3 -= #x2500)
!                          (r3 //= 96)
!                          (r1 = (r7 + 32))
!                          (r1 += ((r3 + 32) << 7))
!                          (write-multibyte-character r0 r1))
! 
!                       ;; U+3400 .. U+D7FF
!                       ;; Try to convert to CJK chars, else keep
!                       ;; them as eight-bit-{control|graphic}.
!                       (if (r3 < #xd800)
!                           ((r4 = r3)  ; don't zap r3
!                            (lookup-integer utf-8-subst-table r4 r5)
!                            (if r7
!                                ;; got a translation
!                                ((write-multibyte-character r4 r5)
!                                 ;; Zapped through register starvation.
!                                 (r5 = ,(charset-id 'eight-bit-control)))
!                              ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
!                              ((r3 = r6)
!                               (write-multibyte-character r3 r0)
!                               (if (r1 < #xa0)
!                                   (r3 = r5))
!                               (write-multibyte-character r3 r1)
!                               (if (r2 < #xa0)
!                                   (r3 = r5)
!                                 (r3 = r6))
!                               (write-multibyte-character r3 r2))))
! 
!                         ;; Surrogates, U+D800 .. U+DFFF
!                         ;; Fixme: process them properly.
!                         (if (r3 < #xe000)
!                             ((r3 = r6)
!                              (write-multibyte-character r3 r0) ; eight-bit-graphic
!                              (if (r1 < #xa0)
!                                  (r3 = r5))
!                              (write-multibyte-character r3 r1)
!                              (if (r2 < #xa0)
!                                  (r3 = r5)
!                                (r3 = r6))
!                              (write-multibyte-character r3 r2))
                        
!                           ;; mule-unicode-e000-ffff
!                           ;; Fixme: fffe and ffff are invalid.
!                           ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
!                            (r3 -= #xe000)
!                            (r3 //= 96)
!                            (r1 = (r7 + 32))
!                            (r1 += ((r3 + 32) << 7))
!                            (write-multibyte-character r0 r1)))))))))
! 
!             (if (r0 < #xfe)
!                 ;; 4byte encoding
!                 ;; keep those bytes as eight-bit-{control|graphic}
!                 ;; Fixme: allow lookup in utf-8-subst-table.
!                 ((read r1 r2 r3)
!                  ;; r0 > #xf0, thus eight-bit-graphic
!                  (write-multibyte-character r6 r0)
!                  (if (r1 < #xa0)
!                      (if (r1 < #x80)  ; invalid byte
!                          (write r1)
!                        (write-multibyte-character r5 r1))
!                    (write-multibyte-character r6 r1))
!                  (if (r2 < #xa0)
!                      (if (r2 < #x80)  ; invalid byte
!                          (write r2)
!                        (write-multibyte-character r5 r2))
!                    (write-multibyte-character r6 r2))
!                  (if (r3 < #xa0)
!                      (if (r3 < #x80)  ; invalid byte
!                          (write r3)
!                        (write-multibyte-character r5 r3))
!                    (write-multibyte-character r6 r3))
!                  (if (r0 >= #xf8)     ; 5- or 6-byte encoding
!                      ((read r1)
!                       (if (r1 < #xa0)
!                           (if (r1 < #x80) ; invalid byte
!                               (write r1)
!                             (write-multibyte-character r5 r1))
!                         (write-multibyte-character r6 r1))
!                       (if (r0 >= #xfc) ; 6-byte
!                           ((read r1)
!                            (if (r1 < #xa0)
!                                (if (r1 < #x80) ; invalid byte
!                                    (write r1)
!                                  (write-multibyte-character r5 r1))
!                              (write-multibyte-character r6 r1)))))))
!               ;; else invalid byte >= #xfe
!               (write-multibyte-character r6 r0))))))
        (repeat))))
  
    "CCL program to decode UTF-8.
  Basic decoding is done into the charsets ascii, latin-iso8859-1 and
! mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
! `utf-8-subst-table'.
! Encodings of un-representable Unicode characters are decoded asis into
! eight-bit-control and eight-bit-graphic characters.")
  
  (define-ccl-program ccl-encode-mule-utf-8
    `(1
***************
*** 288,294 ****
              (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                   (r1 &= #x7f)
!                  (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000
                   (r0 = (((r1 & #xf000) >> 12) | #xe0))
                   (r2 = ((r1 & #x3f) | #x80))
                   (r1 &= #x0fc0)
--- 470,476 ----
              (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                   (r1 &= #x7f)
!                  (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
                   (r0 = (((r1 & #xf000) >> 12) | #xe0))
                   (r2 = ((r1 & #x3f) | #x80))
                   (r1 &= #x0fc0)
***************
*** 329,339 ****
                                ((write #xc2)
                                 (write r1)))))))
  
!                   ;; Unsupported character.
!                   ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
!                   ((write #xef)
!                    (write #xbf)
!                    (write #xbd)))))))))
        (repeat)))
      (if (r1 >= #xa0)
        (write r1)
--- 511,529 ----
                                ((write #xc2)
                                 (write r1)))))))
  
!                   ((lookup-character utf-8-subst-rev-table r0 r1)
!                    (if r7             ; lookup succeeded
!                        ((r1 = (((r0 & #xf000) >> 12) | #xe0))
!                         (r2 = ((r0 & #x3f) | #x80))
!                         (r0 &= #x0fc0)
!                         (r0 >>= 6)
!                         (r0 |= #x80)
!                         (write r1 r0 r2))
!                      ;; Unsupported character.
!                      ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
!                      ((write #xef)
!                       (write #xbf)
!                       (write #xbd)))))))))))
        (repeat)))
      (if (r1 >= #xa0)
        (write r1)
***************
*** 341,409 ****
          ((write #xc2)
           (write r1)))))
  
!   "CCL program to encode into UTF-8.
! Only characters from the charsets ascii, eight-bit-control,
! eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
! Others are encoded as U+FFFD.")
  
  ;; Dummy definition so that the CCL can be checked correctly; the
  ;; actual data are loaded on demand.
  (unless (boundp 'ucs-mule-8859-to-mule-unicode)       ; don't zap it
    (define-translation-table 'ucs-mule-8859-to-mule-unicode))
  
  (defsubst utf-8-untranslated-to-ucs ()
!   (let ((b1 (char-after))
!       (b2 (char-after (1+ (point))))
!       (b3 (char-after (+ 2 (point))))
!       (b4 (char-after (+ 4 (point)))))
!     (if (and b1 b2 b3)
!       (cond ((< b1 ?\xf0)
!              (setq b2 (lsh (logand b2 ?\x3f) 6))
!              (setq b3 (logand b3 ?\x3f))
!              (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
!             (b4
!              (setq b2 (lsh (logand b2 ?\x3f) 12))
!              (setq b3 (lsh (logand b3 ?\x3f) 6))
!              (setq b4 (logand b4 ?\x3f))
!              (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
!                                                    18)))))))))
  
  (defun utf-8-help-echo (window object position)
    (format "Untranslated Unicode U+%04X"
          (get-char-property position 'untranslated-utf-8 object)))
  
- (defvar utf-8-subst-table nil
-   "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
- 
  ;; We compose the untranslatable sequences into a single character.
  ;; This is infelicitous for editing, because there's currently no
  ;; mechanism for treating compositions as atomic, but is OK for
! ;; display.  We try to compose an appropriate character from a hash
! ;; table of CJK characters to display correctly.  Otherwise we use
! ;; U+FFFD.  What we really should have is hash table lookup from CCL
! ;; so that we could do this properly.  This function GCs too much.
  (defsubst utf-8-compose ()
    "Put a suitable composition on an untranslatable sequence.
  Return the sequence's length."
    (let* ((u (utf-8-untranslated-to-ucs))
!        (l (and u (if (>= u ?\x10000)
                       4
!                    3)))
!        (subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
!     (when u
        (put-text-property (point) (min (point-max) (+ l (point)))
                         'untranslated-utf-8 u)
!       (unless subst
!         (put-text-property (point) (min (point-max) (+ l (point)))
!                            'help-echo 'utf-8-help-echo)
!         (setq subst ?$,3u=(B))
!       (compose-region (point) (+ l (point)) subst)
        l)))
  
  (defcustom utf-8-compose-scripts nil
!   "*Non-nil means compose various scipts on decoding utf-8 text."
    :group 'mule
!   :type 'boolean)     ; omitted in Emacs 21.1
  
  (defun utf-8-post-read-conversion (length)
    "Compose untranslated utf-8 sequences into single characters.
--- 531,619 ----
          ((write #xc2)
           (write r1)))))
  
!   "CCL program to encode into UTF-8.")
  
  ;; Dummy definition so that the CCL can be checked correctly; the
  ;; actual data are loaded on demand.
  (unless (boundp 'ucs-mule-8859-to-mule-unicode)       ; don't zap it
    (define-translation-table 'ucs-mule-8859-to-mule-unicode))
  
+ (define-ccl-program ccl-untranslated-to-ucs
+   `(0
+     (if (r0 < #xf0)                   ; 3-byte encoding, as above
+       ((r4 = 0)                       ; r4 is set to 1 on any failed check
+        (r3 = (r1 & #b11000000))       ; top two bits of r1 ...
+        (r3 |= ((r2 >> 2) & #b00110000)) ; ... plus top two of r2 in bits 5-4
+        (if (r3 != #b10100000)         ; both pairs must be #b10
+            (r4 = 1)
+          ((r3 = ((r0 & #x0f) << 12))  ; combine 4 + 6 + 6 payload bits
+           (r3 += ((r1 & #x3f) << 6))
+           (r3 += (r2 & #x3f))
+           (if (r3 < #x0800)           ; results below #x0800 are rejected
+               (r4 = 1))))
+        (if (r4 != 0)
+            (r0 = 0)
+          (r0 = r3)))
+       (if (r0 < #xf8)                 ; 4-byte (Mule-UCS recipe)
+         ((r4 = (r1 >> 6))             ; r1, r2, r3 are tested in turn;
+          (if (r4 != #b10)             ; each must give #b10 after >> 6
+              (r0 = 0)
+            ((r4 = (r2 >> 6))
+             (if (r4 != #b10)
+                 (r0 = 0)
+               ((r4 = (r3 >> 6))
+                (if (r4 != #b10)
+                    (r0 = 0)
+                  ((r1 = ((r1  & #x3F) << 12)) ; move 6-bit payloads into place
+                   (r2 = ((r2  & #x3F) << 6))
+                   (r3 &= #x3F)
+                   (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
+       (r0 = 0))))                     ; r0 >= #xf8: rejected
+   "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
+ r0 == 0 for invalid sequence.")
+ 
+ (defvar utf-8-ccl-regs (make-vector 8 0)) ; register vector for `ccl-execute'
+ 
  (defsubst utf-8-untranslated-to-ucs ()
!   "Return the UCS code for an untranslated sequence of raw bytes at point.
! Only for 3- or 4-byte sequences."
!   (aset utf-8-ccl-regs 0 (or (char-after) 0)) ; 0 stands in when no char there
!   (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
!   (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
!   (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
!   (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
!   (aref utf-8-ccl-regs 0))            ; element 0 is returned as the result
  
  (defun utf-8-help-echo (window object position) ; WINDOW is not used
    (format "Untranslated Unicode U+%04X"
          (get-char-property position 'untranslated-utf-8 object))) ; property value at POSITION
  
  ;; We compose the untranslatable sequences into a single character.
  ;; This is infelicitous for editing, because there's currently no
  ;; mechanism for treating compositions as atomic, but is OK for
! ;; display.  They are composed to U+FFFD with help-echo which
! ;; indicates the unicodes they represent.  This function GCs too much.
  (defsubst utf-8-compose ()
    "Put a suitable composition on an untranslatable sequence.
  Return the sequence's length."
    (let* ((u (utf-8-untranslated-to-ucs))
!        (l (unless (zerop u)
!             (if (>= u #x10000)
                       4
!                    3))))
!     (when l
        (put-text-property (point) (min (point-max) (+ l (point)))
                         'untranslated-utf-8 u)
!       (put-text-property (point) (min (point-max) (+ l (point)))
!                        'help-echo 'utf-8-help-echo)
!       (compose-region (point) (+ l (point)) ?$,3u=(B)
        l)))
  
  (defcustom utf-8-compose-scripts nil
!   "*Non-nil means compose various scripts on decoding utf-8 text."
    :group 'mule
!   :version "21.4"
!   :type 'boolean)
  
  (defun utf-8-post-read-conversion (length)
    "Compose untranslated utf-8 sequences into single characters.
***************
*** 412,449 ****
      ;; Can't do eval-when-compile to insert a multibyte constant
      ;; version of the string in the loop, since it's always loaded as
      ;; unibyte from a byte-compiled file.
!     (let ((range (string-as-multibyte "^\341-\377"))) 
!       (while (and (skip-chars-forward
!                  range)
                  (not (eobp)))
        (forward-char (utf-8-compose)))))
!   ;; Fixme: Takahashi-san implies it may not work this easily -- needs
!   ;; checking with him.
    (when (and utf-8-compose-scripts (> length 1))
      ;; These currently have definitions which cover the relevant
!     ;; Unicodes.  We could avoid loading thai-util &c by checking
      ;; whether the region contains any characters with the appropriate
      ;; categories.  There aren't yet Unicode-based rules for Tibetan.
      (save-excursion (setq length (diacritic-post-read-conversion length)))
      (save-excursion (setq length (thai-post-read-conversion length)))
      (save-excursion (setq length (lao-post-read-conversion length)))
!     (save-excursion (setq length (devanagari-post-read-conversion length))))
    length)
  
! (defun utf-8-pre-write-conversion (beg end)
!   "Semi-dummy pre-write function effectively to autoload ucs-tables."
!   ;; Ensure translation table is loaded.
!   (require 'ucs-tables)
!   ;; Don't do this again.
!   (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
!   nil)
  
  (make-coding-system
   'mule-utf-8 4 ?u
   "UTF-8 encoding for Emacs-supported Unicode characters.
! The supported Emacs character sets are the following, plus others
! which may be included in the translation table
! `ucs-mule-to-mule-unicode':
   ascii
   eight-bit-control
   eight-bit-graphic
--- 622,660 ----
      ;; Can't do eval-when-compile to insert a multibyte constant
      ;; version of the string in the loop, since it's always loaded as
      ;; unibyte from a byte-compiled file.
!     (let ((range (string-as-multibyte "^\xe1-\xf7")))
!       (while (and (skip-chars-forward range)
                  (not (eobp)))
        (forward-char (utf-8-compose)))))
!   ;; Fixme: Takahashi-san implies it may not work this easily.  I
!   ;; asked why but didn't get a reply. -- fx
    (when (and utf-8-compose-scripts (> length 1))
      ;; These currently have definitions which cover the relevant
!     ;; unicodes.  We could avoid loading thai-util &c by checking
      ;; whether the region contains any characters with the appropriate
      ;; categories.  There aren't yet Unicode-based rules for Tibetan.
      (save-excursion (setq length (diacritic-post-read-conversion length)))
      (save-excursion (setq length (thai-post-read-conversion length)))
      (save-excursion (setq length (lao-post-read-conversion length)))
!     (save-excursion
!       (setq length (in-is13194-devanagari-post-read-conversion length))))
    length)
  
! ;; ucs-tables is preloaded
! ;; (defun utf-8-pre-write-conversion (beg end)
! ;;   "Semi-dummy pre-write function effectively to autoload ucs-tables."
! ;;   ;; Ensure translation table is loaded.
! ;;   (require 'ucs-tables)
! ;;   ;; Don't do this again.
! ;;   (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
! ;;   nil)
  
  (make-coding-system
   'mule-utf-8 4 ?u
   "UTF-8 encoding for Emacs-supported Unicode characters.
! The supported Emacs character sets are the following, plus any other
! characters included in the tables `ucs-mule-to-mule-unicode' and
! `utf-8-subst-rev-table':
   ascii
   eight-bit-control
   eight-bit-graphic
***************
*** 462,471 ****
   mule-unicode-e000-ffff
  
  Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
! are decoded into sequences of eight-bit-control and eight-bit-graphic
! characters to preserve their byte sequences and composed to display as
! a single character.  Emacs characters that can't be encoded to these
! ranges are encoded as U+FFFD."
  
   '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
   '((safe-charsets
--- 673,684 ----
   mule-unicode-e000-ffff
  
  Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
! may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
! \(see user option `utf-8-translate-cjk'); otherwise, sequences of
! eight-bit-control and eight-bit-graphic characters are used to
! preserve their byte sequences, and these are composed to display as a
! single character.  Emacs characters that otherwise can't be encoded
! are encoded as U+FFFD."
  
   '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
   '((safe-charsets
***************
*** 497,503 ****
     (mime-charset . utf-8)
     (coding-category . coding-category-utf-8)
     (valid-codes (0 . 255))
!    (pre-write-conversion . utf-8-pre-write-conversion)
     (post-read-conversion . utf-8-post-read-conversion)))
  
  (define-coding-system-alias 'utf-8 'mule-utf-8)
--- 710,716 ----
     (mime-charset . utf-8)
     (coding-category . coding-category-utf-8)
     (valid-codes (0 . 255))
! ;;    (pre-write-conversion . utf-8-pre-write-conversion)
     (post-read-conversion . utf-8-post-read-conversion)))
  
  (define-coding-system-alias 'utf-8 'mule-utf-8)



reply via email to

[Prev in Thread] Current Thread [Next in Thread]