emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] Changes to emacs/lisp/international/utf-8.el [gnus-5_10-br


From: Miles Bader
Subject: [Emacs-diffs] Changes to emacs/lisp/international/utf-8.el [gnus-5_10-branch]
Date: Sat, 04 Sep 2004 08:36:43 -0400

Index: emacs/lisp/international/utf-8.el
diff -c /dev/null emacs/lisp/international/utf-8.el:1.40.2.1
*** /dev/null   Sat Sep  4 12:01:42 2004
--- emacs/lisp/international/utf-8.el   Sat Sep  4 12:01:09 2004
***************
*** 0 ****
--- 1,991 ----
+ ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
+ 
+ ;; Copyright (C) 2001, 2004 Electrotechnical Laboratory, JAPAN.
+ ;; Licensed to the Free Software Foundation.
+ ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
+ 
+ ;; Author: TAKAHASHI Naoto  <address@hidden>
+ ;; Maintainer: FSF
+ ;; Keywords: multilingual, Unicode, UTF-8, i18n
+ 
+ ;; This file is part of GNU Emacs.
+ 
+ ;; GNU Emacs is free software; you can redistribute it and/or modify
+ ;; it under the terms of the GNU General Public License as published by
+ ;; the Free Software Foundation; either version 2, or (at your option)
+ ;; any later version.
+ 
+ ;; GNU Emacs is distributed in the hope that it will be useful,
+ ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ;; GNU General Public License for more details.
+ 
+ ;; You should have received a copy of the GNU General Public License
+ ;; along with GNU Emacs; see the file COPYING.  If not, write to the
+ ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ ;; Boston, MA 02111-1307, USA.
+ 
+ ;;; Commentary:
+ 
+ ;; The coding-system `mule-utf-8' basically supports encoding/decoding
+ ;; of the following character sets to and from UTF-8:
+ ;;
+ ;;   ascii
+ ;;   eight-bit-control
+ ;;   latin-iso8859-1
+ ;;   mule-unicode-0100-24ff
+ ;;   mule-unicode-2500-33ff
+ ;;   mule-unicode-e000-ffff
+ ;;
+ ;; On decoding, Unicode characters that do not fit into the above
+ ;; character sets are handled as `eight-bit-control' or
+ ;; `eight-bit-graphic' characters to retain the information about the
+ ;; original byte sequence and text properties record the corresponding
+ ;; unicode.
+ ;;
+ ;; Fixme: note that reading and writing invalid utf-8 may not be
+ ;; idempotent -- to represent the bytes to fix that needs a new charset.
+ ;;
+ ;; Characters from other character sets can be encoded with mule-utf-8
+ ;; by populating the translation table
+ ;; `utf-translation-table-for-encode'.  Hash tables
+ ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
+ ;; used to support encoding and decoding of about a quarter of the CJK
+ ;; space between U+3400 and U+DFFF.
+ 
+ ;; UTF-8 is defined in RFC 3629.  A sketch of the encoding is:
+ 
+ ;;        scalar       |               utf-8
+ ;;        value        | 1st byte  | 2nd byte  | 3rd byte
+ ;; --------------------+-----------+-----------+----------
+ ;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
+ ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
+ ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
+ 
+ ;;; Code:
+ 
+ (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
+   "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
+ 
+ If `unify-8859-on-encoding-mode' is non-nil, this table populates the
+ translation-table named `utf-translation-table-for-encode'.")
+ 
+ (define-translation-table 'utf-translation-table-for-encode)
+ 
+ 
+ ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
+ ;; space of mule-unicode.  For Latin scripts this isn't very
+ ;; important.  Hebrew and Arabic might go here too when there's proper
+ ;; support for them.
+ 
+ (defvar utf-fragmentation-table (make-char-table 'translation-table nil)
+   "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
+ 
+ If `utf-fragment-on-decoding' is non-nil, this table populates the
+ translation-table named `utf-translation-table-for-decode'")
+ 
+ (defvar utf-defragmentation-table (make-char-table 'translation-table nil)
+   "Char-table for reverse mapping of `utf-fragmentation-table'.
+ 
+ If `utf-fragment-on-decoding' is non-nil and
+ `unify-8859-on-encoding-mode' is nil, this table populates the
+ translation-table named `utf-translation-table-for-encode'")
+ 
+ (define-translation-table 'utf-translation-table-for-decode)
+ 
+ 
+ (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
+   "Hash table mapping Emacs CJK character sets to Unicode code points.
+ 
+ If `utf-translate-cjk-mode' is non-nil, this table populates the
+ translation-hash-table named `utf-subst-table-for-encode'.")
+ 
+ (define-translation-hash-table 'utf-subst-table-for-encode
+   ucs-mule-cjk-to-unicode)
+ 
+ (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
+   "Hash table mapping Unicode code points to Emacs CJK character sets.
+ 
+ If `utf-translate-cjk-mode' is non-nil, this table populates the
+ translation-hash-table named `utf-subst-table-for-decode'.")
+ 
+ (define-translation-hash-table 'utf-subst-table-for-decode
+   ucs-unicode-to-mule-cjk)
+ 
+ (mapc
+  (lambda (pair)
+    (aset utf-fragmentation-table (car pair) (cdr pair))
+    (aset utf-defragmentation-table (cdr pair) (car pair)))
+  '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) 
(?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
+    (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) 
(?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,address@hidden(B)
+    (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) 
(?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
+    (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) 
(?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B)
+    (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) 
(?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B)
+    (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) 
(?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B)
+    (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) 
(?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B)
+    (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) 
(?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B)
+    (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) 
(?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B)
+    (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) 
(?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B)
+    (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) 
(?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B)
+    (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) 
(?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B)
+    (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) 
(?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B)
+    (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) 
(?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B)
+    (?$,1'N(B . ?,F~(B)
+ 
+    (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) 
(?$,1($(B . ?,L$(B)
+    (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) 
(?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B)
+    (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) 
(?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B)
+    (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) 
(?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B)
+    (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) 
(?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B)
+    (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) 
(?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B)
+    (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,address@hidden(B) (?$,1(A(B . 
?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B)
+    (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) 
(?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B)
+    (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) 
(?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B)
+    (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) 
(?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B)
+    (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) 
(?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B)
+    (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) 
(?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B)
+    (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) 
(?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B)
+    (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) 
(?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B)
+    (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) 
(?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B)
+    (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) 
(?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B)
+    (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) 
(?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B)
+    (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) 
(?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
+    (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
+ 
+ 
+ (defcustom utf-fragment-on-decoding nil
+   "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
+ Setting this means that the relevant Cyrillic and Greek characters are
+ decoded into the iso8859 charsets rather than into
+ mule-unicode-0100-24ff.  The iso8859 charsets take half as much space
+ in the buffer, but using them may affect how the buffer can be re-encoded
+ and may require a different input method to search for them, for instance.
+ See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
+ for mechanisms to make this largely transparent.
+ 
+ Setting this variable outside customize has no effect."
+   :set (lambda (s v)
+        (if v
+            (progn
+              (define-translation-table 'utf-translation-table-for-decode
+                utf-fragmentation-table)
+              ;; Even if unify-8859-on-encoding-mode is off, make
+              ;; mule-utf-* encode characters in
+              ;; utf-fragmentation-table.
+              (unless (eq (get 'utf-translation-table-for-encode
+                               'translation-table)
+                          ucs-mule-to-mule-unicode)
+                (define-translation-table 'utf-translation-table-for-encode
+                  utf-defragmentation-table)))
+          (define-translation-table 'utf-translation-table-for-decode)
+          ;; When unify-8859-on-encoding-mode is off, be sure to make
+          ;; mule-utf-* disabled for characters in
+          ;; utf-fragmentation-table.
+          (unless (eq (get 'utf-translation-table-for-encode
+                           'translation-table)
+                      ucs-mule-to-mule-unicode)
+            (define-translation-table 'utf-translation-table-for-encode)))
+        (set-default s v))
+   :version "21.4"
+   :type 'boolean
+   :group 'mule)
+ 
+ 
+ (defconst utf-translate-cjk-charsets '(chinese-gb2312
+                                      chinese-big5-1 chinese-big5-2
+                                      japanese-jisx0208 japanese-jisx0212
+                                      korean-ksc5601)
+   "List of charsets supported by `utf-translate-cjk-mode'.")
+ 
+ (defconst utf-translate-cjk-unicode-range
+   '((#x2e80 . #xd7a3)
+     (#xff00 . #xffef))
+   "List of Unicode code ranges supported by `utf-translate-cjk-mode'.")
+ 
+ ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
+ (defsubst utf-translate-cjk-substitutable-p (code-point)
+   (let ((tail utf-translate-cjk-unicode-range)
+       elt)
+     (while tail
+       (setq elt (car tail) tail (cdr tail))
+       (if (and (>= code-point (car elt)) (<= code-point (cdr elt)))
+         (setq tail nil)
+       (setq elt nil)))
+     elt))
+ 
+ (defvar utf-translate-cjk-lang-env nil
+   "Language environment in which tables for `utf-translate-cjk-mode' is 
loaded.
+ The value nil means that the tables are not yet loaded.")
+ 
+ (defun utf-translate-cjk-load-tables ()
+   "Load tables for `utf-translate-cjk-mode'."
+   ;; Fixme: Allow the use of the CJK charsets to be
+   ;; customized by reordering and possible omission.
+   (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
+     (if redefined
+       ;; Redefine them with realistic initial sizes and a
+       ;; smallish rehash size to avoid wasting significant
+       ;; space after they're built.
+       (setq ucs-mule-cjk-to-unicode
+             (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
+             ucs-unicode-to-mule-cjk
+             (make-hash-table :test 'eq :size 21500 :rehash-size 1000)))
+ 
+     ;; Load the files explicitly, to avoid having to keep
+     ;; around the large tables they contain (as well as the
+     ;; ones which get built).
+     (cond ((string= "Korean" current-language-environment)
+          (load "subst-jis")
+          (load "subst-big5")
+          (load "subst-gb2312")
+          (load "subst-ksc"))
+         ((string= "Chinese-BIG5" current-language-environment)
+          (load "subst-jis")
+          (load "subst-ksc")
+          (load "subst-gb2312")
+          (load "subst-big5"))
+         ((string= "Chinese-GB" current-language-environment)
+          (load "subst-jis")
+          (load "subst-ksc")
+          (load "subst-big5")
+          (load "subst-gb2312"))
+         (t
+          (load "subst-ksc")
+          (load "subst-gb2312")
+          (load "subst-big5")
+          (load "subst-jis")))   ; jis covers as much as big5, gb2312
+ 
+     (when redefined
+       (define-translation-hash-table 'utf-subst-table-for-decode
+       ucs-unicode-to-mule-cjk)
+       (define-translation-hash-table 'utf-subst-table-for-encode
+       ucs-mule-cjk-to-unicode)
+       (set-char-table-extra-slot (get 'utf-translation-table-for-encode
+                                     'translation-table)
+                                1 ucs-mule-cjk-to-unicode))
+ 
+     (setq utf-translate-cjk-lang-env current-language-environment)))
+ 
+ (defun utf-lookup-subst-table-for-decode (code-point)
+   (if (and utf-translate-cjk-mode
+          (not utf-translate-cjk-lang-env)
+          (utf-translate-cjk-substitutable-p code-point))
+       (utf-translate-cjk-load-tables))
+   (gethash code-point
+          (get 'utf-subst-table-for-decode 'translation-hash-table)))
+ 
+ 
+ (defun utf-lookup-subst-table-for-encode (char)
+   (if (and utf-translate-cjk-mode
+          (not utf-translate-cjk-lang-env)
+          (memq (char-charset char) utf-translate-cjk-charsets))
+       (utf-translate-cjk-load-tables))
+   (gethash char
+          (get 'utf-subst-table-for-encode 'translation-hash-table)))
+ 
+ (define-minor-mode utf-translate-cjk-mode
+   "Toggle whether UTF based coding systems de/encode CJK characters.
+ If ARG is an integer, enable if ARG is positive and disable if
+ zero or negative.  This is a minor mode.
+ Enabling this allows the coding systems mule-utf-8,
+ mule-utf-16le and mule-utf-16be to encode characters in the charsets
+ `korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
+ `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
+ decode the corresponding unicodes into such characters.
+ 
+ Where the charsets overlap, the one preferred for decoding is chosen
+ according to the language environment in effect when this option is
+ turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
+ Chinese-Big5 and jisx for other environments.
+ 
+ This mode is on by default.  If you are not interested in CJK
+ characters and want to avoid some overhead on encoding/decoding
+ by the above coding systems, you can customize the user option
+ `utf-translate-cjk-mode' to nil."
+   :init-value t
+   :version "21.4"
+   :type 'boolean
+   :group 'mule
+   :global t
+   (if utf-translate-cjk-mode
+       (progn
+       (define-translation-hash-table 'utf-subst-table-for-decode
+         ucs-unicode-to-mule-cjk)
+       (define-translation-hash-table 'utf-subst-table-for-encode
+         ucs-mule-cjk-to-unicode)
+       (set-char-table-extra-slot (get 'utf-translation-table-for-encode
+                                       'translation-table)
+                                  1 ucs-mule-cjk-to-unicode))
+     (define-translation-hash-table 'utf-subst-table-for-decode
+       (make-hash-table :test 'eq))
+     (define-translation-hash-table 'utf-subst-table-for-encode
+       (make-hash-table :test 'eq))
+     (set-char-table-extra-slot (get 'utf-translation-table-for-encode
+                                   'translation-table)
+                              1 nil))
+ 
+   ;; Update safe-chars of mule-utf-* coding systems.
+   (dolist (elt (coding-system-list t))
+     (if (string-match "^mule-utf" (symbol-name elt))
+       (let ((safe-charsets (coding-system-get elt 'safe-charsets))
+             (safe-chars (coding-system-get elt 'safe-chars))
+             (need-update nil))
+         (dolist (charset utf-translate-cjk-charsets)
+           (unless (eq utf-translate-cjk-mode (memq charset safe-charsets))
+             (setq safe-charsets
+                   (if utf-translate-cjk-mode
+                       (cons charset safe-charsets)
+                     (delq charset safe-charsets))
+                   need-update t)
+             (aset safe-chars (make-char charset) utf-translate-cjk-mode)))
+         (when need-update
+           (coding-system-put elt 'safe-charsets safe-charsets)
+           (define-coding-system-internal elt))))))
+ 
+ (define-ccl-program ccl-mule-utf-untrans
+   ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or
+   ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF).  Write
+   ;; eight-bit-control/graphic sequence (2 to 4 chars) representing
+   ;; UTF-8 sequence of r0.  Registers r4, r5, r6 are modified.
+   ;;
+   ;; This is a subrountine because we assume that this is called very
+   ;; rarely (so we don't have to worry about the overhead of the
+   ;; call).
+   `(0
+     ((r5 = ,(charset-id 'eight-bit-control))
+      (r6 = ,(charset-id 'eight-bit-graphic))
+      (if (r0 < #x100)
+        ((r4 = ((r0 >> 6) | #xC0))
+         (write-multibyte-character r6 r4))
+        ((if (r0 < #x10000)
+           ((r4 = ((r0 >> 12) | #xE0))
+            (write-multibyte-character r6 r4))
+         ((r4 = ((r0 >> 18) | #xF0))
+          (write-multibyte-character r6 r4)
+          (r4 = (((r0 >> 12) & #x3F) | #x80))
+          (if (r4 < #xA0)
+              (write-multibyte-character r5 r4)
+            (write-multibyte-character r6 r4))))
+       (r4 = (((r0 >> 6) & #x3F) | #x80))
+       (if (r4 < #xA0)
+           (write-multibyte-character r5 r4)
+         (write-multibyte-character r6 r4))))
+      (r4 = ((r0 & #x3F) | #x80))
+      (if (r4 < #xA0)
+        (write-multibyte-character r5 r4)
+        (write-multibyte-character r6 r4)))))
+ 
+ (define-ccl-program ccl-decode-mule-utf-8
+   ;;
+   ;;        charset         | bytes in utf-8 | bytes in emacs
+   ;; -----------------------+----------------+---------------
+   ;;         ascii          |       1        |       1
+   ;; -----------------------+----------------+---------------
+   ;;    eight-bit-control   |       2        |       2
+   ;;    eight-bit-graphic   |       2        |       1
+   ;;     latin-iso8859-1    |       2        |       2
+   ;; -----------------------+----------------+---------------
+   ;; mule-unicode-0100-24ff |       2        |       4
+   ;;        (< 0800)        |                |
+   ;; -----------------------+----------------+---------------
+   ;; mule-unicode-0100-24ff |       3        |       4
+   ;;        (>= 8000)       |                |
+   ;; mule-unicode-2500-33ff |       3        |       4
+   ;; mule-unicode-e000-ffff |       3        |       4
+   ;; -----------------------+----------------+---------------
+   ;;      invalid byte      |       1        |       2
+   ;;
+   ;; Thus magnification factor is two.
+   ;;
+   `(2
+     ((r6 = ,(charset-id 'latin-iso8859-1))
+      (read r0)
+      (loop
+       (if (r0 < #x80)
+         ;; 1-byte encoding, i.e., ascii
+         (write-read-repeat r0))
+       (if (r0 < #xc2)
+         ;; continuation byte (invalid here) or 1st byte of overlong
+         ;; 2-byte sequence.
+         ((call ccl-mule-utf-untrans)
+          (r6 = ,(charset-id 'latin-iso8859-1))
+          (read r0)
+          (repeat)))
+ 
+       ;; Read the 2nd byte.
+       (read r1)
+       (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
+         ((call ccl-mule-utf-untrans)
+          (r6 = ,(charset-id 'latin-iso8859-1))
+          ;; Handle it in the next loop.
+          (r0 = r1)
+          (repeat)))
+ 
+       (if (r0 < #xe0)
+         ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
+         ((r1 &= #x3F)
+          (r1 |= ((r0 & #x1F) << 6))
+          ;; Now r2 holds scalar value.  We don't have to check
+          ;; `overlong sequence' because r0 >= 0xC2.
+ 
+          (if (r1 >= 256)
+              ;; mule-unicode-0100-24ff (< 0800)
+              ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+               (r1 -= #x0100)
+               (r2 = (((r1 / 96) + 32) << 7))
+               (r1 %= 96)
+               (r1 += (r2 + 32))
+               (translate-character
+                utf-translation-table-for-decode r0 r1)
+               (write-multibyte-character r0 r1)
+               (read r0)
+               (repeat))
+            (if (r1 >= 160)
+                ;; latin-iso8859-1
+                ((r1 -= 128)
+                 (write-multibyte-character r6 r1)
+                 (read r0)
+                 (repeat))
+              ;; eight-bit-control
+              ((r0 = ,(charset-id 'eight-bit-control))
+               (write-multibyte-character r0 r1)
+               (read r0)
+               (repeat))))))
+ 
+       ;; Read the 3rd bytes.
+       (read r2)
+       (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
+         ((call ccl-mule-utf-untrans)
+          (r0 = r1)
+          (call ccl-mule-utf-untrans)
+          (r6 = ,(charset-id 'latin-iso8859-1))
+          ;; Handle it in the next loop.
+          (r0 = r2)
+          (repeat)))
+ 
+       (if (r0 < #xF0)
+         ;; 3byte encoding
+         ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
+         ((r3 = ((r0 & #xF) << 12))
+          (r3 |= ((r1 & #x3F) << 6))
+          (r3 |= (r2 & #x3F))
+ 
+          (if (r3 < #x800)             ; `overlong sequence'
+              ((call ccl-mule-utf-untrans)
+               (r0 = r1)
+               (call ccl-mule-utf-untrans)
+               (r0 = r2)
+               (call ccl-mule-utf-untrans)
+               (r6 = ,(charset-id 'latin-iso8859-1))
+               (read r0)
+               (repeat)))
+ 
+          (if (r3 < #x2500)
+              ;; mule-unicode-0100-24ff (>= 0800)
+              ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+               (r3 -= #x0100)
+               (r3 //= 96)
+               (r1 = (r7 + 32))
+               (r1 += ((r3 + 32) << 7))
+               (translate-character
+                utf-translation-table-for-decode r0 r1)
+               (write-multibyte-character r0 r1)
+               (read r0)
+               (repeat)))
+ 
+          (if (r3 < #x3400)
+              ;; mule-unicode-2500-33ff
+              ((r0 = r3)               ; don't zap r3
+               (lookup-integer utf-subst-table-for-decode r0 r1)
+               (if (r7 == 0)
+                   ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
+                    (r3 -= #x2500)
+                    (r3 //= 96)
+                    (r1 = (r7 + 32))
+                    (r1 += ((r3 + 32) << 7))))
+               (write-multibyte-character r0 r1)
+               (read r0)
+               (repeat)))
+ 
+          (if (r3 < #xE000)
+              ;; Try to convert to CJK chars, else
+              ;; keep them as eight-bit-{control|graphic}.
+              ((r0 = r3)
+               (lookup-integer utf-subst-table-for-decode r3 r1)
+               (if r7
+                   ;; got a translation
+                   ((write-multibyte-character r3 r1)
+                    (read r0)
+                    (repeat))
+                 ((call ccl-mule-utf-untrans)
+                  (r6 = ,(charset-id 'latin-iso8859-1))
+                  (read r0)
+                  (repeat)))))
+ 
+          ;; mule-unicode-e000-ffff
+          ;; Fixme: fffe and ffff are invalid.
+          (r0 = r3)            ; don't zap r3
+          (lookup-integer utf-subst-table-for-decode r0 r1)
+          (if (r7 == 0)
+              ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
+               (r3 -= #xe000)
+               (r3 //= 96)
+               (r1 = (r7 + 32))
+               (r1 += ((r3 + 32) << 7))))
+          (write-multibyte-character r0 r1)
+          (read r0)
+          (repeat)))
+ 
+       ;; Read the 4th bytes.
+       (read r3)
+       (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
+         ((call ccl-mule-utf-untrans)
+          (r0 = r1)
+          (call ccl-mule-utf-untrans)
+          (r0 = r2)
+          (call ccl-mule-utf-untrans)
+          (r6 = ,(charset-id 'latin-iso8859-1))
+          ;; Handle it in the next loop.
+          (r0 = r3)
+          (repeat)))
+ 
+       (if (r0 < #xF8)
+         ;; 4-byte encoding:
+         ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
+         ;; keep those bytes as eight-bit-{control|graphic}
+         ;; Fixme: allow lookup in utf-subst-table-for-decode.
+         ((r4 = ((r0 & #x7) << 18))
+          (r4 |= ((r1 & #x3F) << 12))
+          (r4 |= ((r2 & #x3F) << 6))
+          (r4 |= (r3 & #x3F))
+ 
+          (if (r4 < #x10000)           ; `overlong sequence'
+              ((call ccl-mule-utf-untrans)
+               (r0 = r1)
+               (call ccl-mule-utf-untrans)
+               (r0 = r2)
+               (call ccl-mule-utf-untrans)
+               (r0 = r3)
+               (call ccl-mule-utf-untrans))
+            ((r0 = r4)
+             (call ccl-mule-utf-untrans))))
+ 
+       ;; Unsupported sequence.
+       ((call ccl-mule-utf-untrans)
+        (r0 = r1)
+        (call ccl-mule-utf-untrans)
+        (r0 = r2)
+        (call ccl-mule-utf-untrans)
+        (r0 = r3)
+        (call ccl-mule-utf-untrans)))
+       (r6 = ,(charset-id 'latin-iso8859-1))
+       (read r0)
+       (repeat)))
+ 
+ 
+     ;; At EOF...
+     (if (r0 >= 0)
+       ;; r0 >= #x80
+       ((call ccl-mule-utf-untrans)
+        (if (r1 >= 0)
+            ((r0 = r1)
+             (call ccl-mule-utf-untrans)
+             (if (r2 >= 0)
+                 ((r0 = r2)
+                  (call ccl-mule-utf-untrans)
+                  (if (r3 >= 0)
+                      ((r0 = r3)
+                       (call ccl-mule-utf-untrans))))))))))
+ 
+   "CCL program to decode UTF-8.
+ Basic decoding is done into the charsets ascii, latin-iso8859-1 and
+ mule-unicode-*, but see also `utf-fragmentation-table' and
+ `ucs-mule-cjk-to-unicode'.
+ Encodings of un-representable Unicode characters are decoded asis into
+ eight-bit-control and eight-bit-graphic characters.")
+ 
+ (define-ccl-program ccl-mule-utf-8-encode-untrans
+   ;; UTF-8 decoder generates an UTF-8 sequence represented by a
+   ;; sequence eight-bit-control/graphic chars for an untranslatable
+   ;; character and an invalid byte.
+   ;;
+   ;; This CCL parses that sequence (the first byte is already in r1),
+   ;; writes out the original bytes of that sequence, and sets r5 to
+   ;; -1.
+   ;;
+   ;; If the eight-bit-control/graphic sequence is shorter than what r1
+   ;; suggests, it sets r5 and r6 to the last character read that
+   ;; should be handled by the next loop of a caller.
+   ;;
+   ;; Note: For UTF-8 validation, we only check if a character is
+   ;; eight-bit-control/graphic or not.  It may result in incorrect
+   ;; handling of random binary data, but such a data can't be encoded
+   ;; by UTF-8 anyway.  At least, UTF-8 decoders doesn't generate such
+   ;; a sequence even if a source contains invalid byte-sequence.
+   `(0
+     (;; Read the 2nd byte.
+      (read-multibyte-character r5 r6)
+      (r0 = (r5 != ,(charset-id 'eight-bit-control)))
+      (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
+        ((write r1)                    ; invalid UTF-8
+         (r1 = -1)
+         (end)))
+ 
+      (if (r1 <= #xC3)
+        ;; 2-byte sequence for an originally invalid byte.
+        ((r6 &= #x3F)
+         (r6 |= ((r1 & #x1F) << 6))
+         (write r6)
+         (r5 = -1)
+         (end)))
+ 
+      (write r1 r6)
+      (r2 = r1)
+      (r1 = -1)
+      ;; Read the 3rd byte.
+      (read-multibyte-character r5 r6)
+      (r0 = (r5 != ,(charset-id 'eight-bit-control)))
+      (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
+        (end))                         ; invalid UTF-8
+      (write r6)
+      (if (r2 < #xF0)
+        ;; 3-byte sequence for an untranslated character.
+        ((r5 = -1)
+         (end)))
+      ;; Read the 4th byte.
+      (read-multibyte-character r5 r6)
+      (r0 = (r5 != ,(charset-id 'eight-bit-control)))
+      (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
+        (end))                 ; invalid UTF-8
+      ;; 4-byte sequence for an untranslated character.
+      (write r6)
+      (r5 = -1)
+      (end))
+ 
+     ;; At EOF...
+     ((r5 = -1)
+      (if (r1 >= 0)
+        (write r1)))))
+ 
+ (define-ccl-program ccl-encode-mule-utf-8
+   `(1
+     ((r5 = -1)
+      (loop
+       (if (r5 < 0)
+         (read-multibyte-character r0 r1)
+       ;; Pre-read character is in r5 (charset-ID) and r6 (code-point).
+       ((r0 = r5)
+        (r1 = r6)
+        (r5 = -1)))
+       (translate-character utf-translation-table-for-encode r0 r1)
+ 
+       (if (r0 == ,(charset-id 'ascii))
+         (write-repeat r1))
+ 
+       (if (r0 == ,(charset-id 'latin-iso8859-1))
+         ;; r1          scalar                  utf-8
+         ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
+         ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
+         ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
+         ((write ((r1 >> 6) | #xc2))
+          (r1 &= #x3f)
+          (r1 |= #x80)
+          (write-repeat r1)))
+ 
+       (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
+         ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+          ;; #x3f80 == (0011 1111 1000 0000)b
+          (r1 &= #x7f)
+          (r1 += (r0 + 224))           ; 240 == -32 + #x0100
+          ;; now r1 holds scalar value
+          (if (r1 < #x0800)
+              ;; 2byte encoding
+              ((write ((r1 >> 6) | #xC0))
+               (r1 &= #x3F)
+               (r1 |= #x80)
+               (write-repeat r1))
+            ;; 3byte encoding
+            ((write ((r1 >> 12) | #xE0))
+             (write  (((r1 & #x0FC0) >> 6) | #x80))
+             (r1 &= #x3F)
+             (r1 |= #x80)
+             (write-repeat r1)))))
+ 
+       (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
+         ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+          (r1 &= #x7f)
+          (r1 += (r0 + 9440))          ; 9440 == -32 + #x2500
+          ;; now r1 holds scalar value
+          (write ((r1 >> 12) | #xE0))
+          (write  (((r1 & #x0FC0) >> 6) | #x80))
+          (r1 &= #x3F)
+          (r1 |= #x80)
+          (write-repeat r1)))
+ 
+       (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
+         ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+          (r1 &= #x7f)
+          (r1 += (r0 + 57312))         ; 57312 == -32 + #xe000
+          ;; now r1 holds scalar value
+          (write ((r1 >> 12) | #xE0))
+          (write  (((r1 & #x0FC0) >> 6) | #x80))
+          (r1 &= #x3F)
+          (r1 |= #x80)
+          (write-repeat r1)))
+ 
+       (if (r0 == ,(charset-id 'eight-bit-control))
+         ;; r1          scalar                  utf-8
+         ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
+         ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
+         ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
+         ((write #xC2)
+          (write-repeat r1)))
+ 
+       (if (r0 == ,(charset-id 'eight-bit-graphic))
+         ;; r1          scalar                  utf-8
+         ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
+         ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
+         ;; ff    0000 0000 1111 1111    1101 1111 1011 1111
+         ((r0 = (r1 >= #xC0))
+          (r0 &= (r1 <= #xC3))
+          (r4 = (r1 >= #xE1))
+          (r4 &= (r1 <= #xF7))
+          (r0 |= r4)
+          (if r0
+              ((call ccl-mule-utf-8-encode-untrans)
+               (repeat))
+            (write-repeat r1))))
+ 
+       (lookup-character utf-subst-table-for-encode r0 r1)
+       (if r7          ; lookup succeeded
+         (if (r0 < #x800)
+             ;; 2byte encoding
+             ((write ((r0 >> 6) | #xC0))
+              (r0 = ((r0 & #x3F) | #x80))
+              (write-repeat r0))
+           ;; 3byte encoding
+           ((write ((r0 >> 12) | #xE0))
+            (write  (((r0 & #x0FC0) >> 6) | #x80))
+            (r0 = ((r0 & #x3F) | #x80))
+            (write-repeat r0))))
+ 
+       ;; Unsupported character.
+       ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
+       (write #xef)
+       (write #xbf)
+       (write-repeat #xbd))))
+   "CCL program to encode into UTF-8.")
+ 
+ 
+ (define-ccl-program ccl-untranslated-to-ucs
+   `(0
+     (if (r1 == 0)
+       nil
+       (if (r0 <= #xC3)                        ; 2-byte encoding
+         ((r0 = ((r0 & #x3) << 6))
+          (r0 |= (r1 & #x3F))
+          (r1 = 2))
+       (if (r2 == 0)
+           (r1 = 0)
+         (if (r0 < #xF0)               ; 3-byte encoding, as above
+             ((r0 = ((r0 & #xF) << 12))
+              (r0 |= ((r1 & #x3F) << 6))
+              (r0 |= (r2 & #x3F))
+              (r1 = 3))
+           (if (r3 == 0)
+               (r1 = 0)
+             ((r0 = ((r0 & #x7) << 18))
+              (r0 |= ((r1 & #x3F) << 12))
+              (r0 |= ((r2 & #x3F) << 6))
+              (r0 |= (r3 & #x3F))
+              (r1 = 4))))))))
+   "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
+ Set r1 to the byte length.  r0 == 0 for invalid sequence.")
+ 
+ (defvar utf-8-ccl-regs (make-vector 8 0))
+ 
+ (defsubst utf-8-untranslated-to-ucs ()
+   "Return the UCS code for an untranslated sequence of raw bytes t point.
+ Only for 3- or 4-byte sequences."
+   (aset utf-8-ccl-regs 0 (or (char-after) 0))
+   (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
+   (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
+   (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
+   (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
+ 
+ (defun utf-8-help-echo (window object position)
+   (format "Untranslated Unicode U+%04X"
+         (get-char-property position 'untranslated-utf-8 object)))
+ 
+ ;; We compose the untranslatable sequences into a single character,
+ ;; and move point to the next character.
+ ;; This is infelicitous for editing, because there's currently no
+ ;; mechanism for treating compositions as atomic, but is OK for
+ ;; display.  They are composed to U+FFFD with help-echo which
+ ;; indicates the unicodes they represent.  This function GCs too much.
+ 
+ ;; If utf-translate-cjk-mode is non-nil, this function is called with
+ ;; HASH-TABLE which translates CJK characters into some of CJK
+ ;; charsets.
+ 
+ (defsubst utf-8-compose (hash-table)
+   "Put a suitable composition on an untranslatable sequence at point.
+ If HASH-TABLE is non-nil, try to translate CJK characters by it at first.
+ Move point to the end of the sequence."
+   (utf-8-untranslated-to-ucs)
+   (let ((l (aref utf-8-ccl-regs 1))
+       ch)
+     (if (> l 0)
+       (if (and hash-table
+                (setq ch (gethash (aref utf-8-ccl-regs 0)  hash-table)))
+           (progn
+             (insert ch)
+             (delete-region (point) (min (point-max) (+ l (point)))))
+         (setq ch (aref utf-8-ccl-regs 0))
+         (put-text-property (point) (min (point-max) (+ l (point)))
+                            'untranslated-utf-8 ch)
+         (put-text-property (point) (min (point-max) (+ l (point)))
+                            'help-echo 'utf-8-help-echo)
+         (if (= l 2)
+             (put-text-property (point) (min (point-max) (+ l (point)))
+                                'display (format "\\%03o" ch))
+           (compose-region (point) (+ l (point)) ?$,3u=(B))
+         (forward-char l))
+       (forward-char 1))))
+ 
+ (defcustom utf-8-compose-scripts nil
+   "*Non-nil means compose various scripts on decoding utf-8 text."
+   :group 'mule
+   :version "21.4"
+   :type 'boolean)
+ 
+ (defun utf-8-post-read-conversion (length)
+   "Compose untranslated utf-8 sequences into single characters.
+ If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
+ Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
+   (save-excursion
+     (save-restriction
+       (narrow-to-region (point) (+ (point) length))
+       ;; Can't do eval-when-compile to insert a multibyte constant
+       ;; version of the string in the loop, since it's always loaded as
+       ;; unibyte from a byte-compiled file.
+       (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
+           (buffer-multibyte enable-multibyte-characters)
+           hash-table ch)
+       (set-buffer-multibyte t)
+       (when utf-translate-cjk-mode
+         (if (not utf-translate-cjk-lang-env)
+             ;; Check these characters:
+             ;;   "U+2e80-U+33ff", "U+ff00-U+ffef"
+             ;; We may have to translate them to CJK charsets.
+             (let ((range2 "$,address@hidden(B-$,2G$,3r`(B-$,3u/(B"))
+               (skip-chars-forward (concat range range2))
+               (unless (eobp)
+                 (utf-translate-cjk-load-tables)
+                 (setq range (concat range range2)))
+         (setq hash-table (get 'utf-subst-table-for-decode
+                               'translation-hash-table)))))
+       (while (and (skip-chars-forward range)
+                   (not (eobp)))
+         (setq ch (following-char))
+         (if (< ch 256)
+             (utf-8-compose hash-table)
+           (if (and hash-table
+                    (setq ch (gethash (encode-char ch 'ucs) hash-table)))
+               (progn
+                 (insert ch)
+                 (delete-char 1))
+             (forward-char 1))))
+       (or buffer-multibyte
+           (set-buffer-multibyte nil)))
+ 
+       (when (and utf-8-compose-scripts (> length 1))
+       ;; These currently have definitions which cover the relevant
+       ;; unicodes.  We could avoid loading thai-util &c by checking
+       ;; whether the region contains any characters with the appropriate
+       ;; categories.  There aren't yet Unicode-based rules for Tibetan.
+       (diacritic-compose-region (point-max) (point-min))
+       (thai-compose-region (point-max) (point-min))
+       (lao-compose-region (point-max) (point-min))
+       (devanagari-compose-region (point-max) (point-min))
+       (malayalam-compose-region (point-max) (point-min))
+       (tamil-compose-region (point-max) (point-min)))
+       (- (point-max) (point-min)))))
+ 
+ (defun utf-8-pre-write-conversion (beg end)
+   "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
+ This is used as a post-read-conversion of utf-8 coding system."
+   (if (and utf-translate-cjk-mode
+          (not utf-translate-cjk-lang-env)
+          (save-excursion
+            (goto-char beg)
+            (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))
+       (utf-translate-cjk-load-tables))
+   nil)
+ 
+ (make-coding-system
+  'mule-utf-8 4 ?u
+  "UTF-8 encoding for Emacs-supported Unicode characters.
+ It supports Unicode characters of these ranges:
+     U+0000..U+33FF, U+E000..U+FFFF.
+ They correspond to these Emacs character sets:
+     ascii, latin-iso8859-1, mule-unicode-0100-24ff,
+     mule-unicode-2500-33ff, mule-unicode-e000-ffff
+ 
+ On decoding (e.g. reading a file), Unicode characters not in the above
+ ranges are decoded into sequences of eight-bit-control and
+ eight-bit-graphic characters to preserve their byte sequences.  The
+ byte sequence is preserved on i/o for valid utf-8, but not necessarily
+ for invalid utf-8.
+ 
+ On encoding (e.g. writing a file), Emacs characters not belonging to
+ any of the character sets listed above are encoded into the UTF-8 byte
+ sequence representing U+FFFD (REPLACEMENT CHARACTER)."
+ 
+  '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
+  `((safe-charsets
+     ascii
+     eight-bit-control
+     eight-bit-graphic
+     latin-iso8859-1
+     mule-unicode-0100-24ff
+     mule-unicode-2500-33ff
+     mule-unicode-e000-ffff
+     ,@(if utf-translate-cjk-mode
+         utf-translate-cjk-charsets))
+    (mime-charset . utf-8)
+    (coding-category . coding-category-utf-8)
+    (valid-codes (0 . 255))
+    (pre-write-conversion . utf-8-pre-write-conversion)
+    (post-read-conversion . utf-8-post-read-conversion)
+    (translation-table-for-encode . utf-translation-table-for-encode)
+    (dependency unify-8859-on-encoding-mode
+              unify-8859-on-decoding-mode
+              utf-fragment-on-decoding
+              utf-translate-cjk-mode)))
+ 
+ (define-coding-system-alias 'utf-8 'mule-utf-8)
+ 
+ ;; I think this needs special private charsets defined for the
+ ;; untranslated sequences, if it's going to work well.
+ 
+ ;;; (defun utf-8-compose-function (pos to pattern &optional string)
+ ;;;   (let* ((prop (get-char-property pos 'composition string))
+ ;;;    (l (and prop (- (cadr prop) (car prop)))))
+ ;;;     (cond ((and l (> l (- to pos)))
+ ;;;      (delete-region pos to))
+ ;;;     ((and (> (char-after pos) 224)
+ ;;;           (< (char-after pos) 256)
+ ;;;           (save-restriction
+ ;;;             (narrow-to-region pos to)
+ ;;;             (utf-8-compose)))
+ ;;;      t))))
+ 
+ ;;; (dotimes (i 96)
+ ;;;   (aset composition-function-table
+ ;;;   (+ 128 i)
+ ;;;   `((,(string-as-multibyte "[\200-\237\240-\377]")
+ ;;;      . utf-8-compose-function))))
+ 
+ ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5
+ ;;; utf-8.el ends here




reply via email to

[Prev in Thread] Current Thread [Next in Thread]