[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Emacs-diffs] Changes to emacs/lisp/international/utf-8.el
From: |
Kenichi Handa |
Subject: |
[Emacs-diffs] Changes to emacs/lisp/international/utf-8.el |
Date: |
Fri, 11 Jun 2004 22:11:58 -0400 |
Index: emacs/lisp/international/utf-8.el
diff -c emacs/lisp/international/utf-8.el:1.37
emacs/lisp/international/utf-8.el:1.38
*** emacs/lisp/international/utf-8.el:1.37 Sat May 8 14:35:00 2004
--- emacs/lisp/international/utf-8.el Sat Jun 12 02:10:37 2004
***************
*** 190,198 ****
:type 'boolean
:group 'mule)
(define-minor-mode utf-translate-cjk-mode
"Whether the UTF based coding systems should decode/encode CJK characters.
! Enabling this loads tables which allow the coding systems mule-utf-8,
mule-utf-16le and mule-utf-16be to encode characters in the charsets
`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
--- 190,291 ----
:type 'boolean
:group 'mule)
+
+ (defconst utf-translate-cjk-charsets '(chinese-gb2312
+ chinese-big5-1 chinese-big5-2
+ japanese-jisx0208 japanese-jisx0212
+ korean-ksc5601)
+ "List of charsets supported by `utf-translate-cjk-mode'.")
+
+ (defconst utf-translate-cjk-unicode-range
+ '((#x2e80 . #xd7a3)
+ (#xff00 . #xffef))
+ "List of Unicode code ranges supported by `utf-translate-cjk-mode'.")
+
+ ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
+ (defsubst utf-translate-cjk-substitutable-p (code-point)
+ (let ((tail utf-translate-cjk-unicode-range)
+ elt)
+ (while tail
+ (setq elt (car tail) tail (cdr tail))
+ (if (and (>= code-point (car elt)) (<= code-point (cdr elt)))
+ (setq tail nil)
+ (setq elt nil)))
+ elt))
+
+ (defvar utf-translate-cjk-lang-env nil
+ "Language environment in which tables for `utf-translate-cjk-mode' is
loaded.
+ The value nil means that the tables are not yet loaded.")
+
+ (defun utf-translate-cjk-load-tables ()
+ "Load tables for `utf-translate-cjk-mode'."
+ ;; Fixme: Allow the use of the CJK charsets to be
+ ;; customized by reordering and possible omission.
+ (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
+ (if redefined
+ ;; Redefine them with realistic initial sizes and a
+ ;; smallish rehash size to avoid wasting significant
+ ;; space after they're built.
+ (setq ucs-mule-cjk-to-unicode
+ (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
+ ucs-unicode-to-mule-cjk
+ (make-hash-table :test 'eq :size 21500 :rehash-size 1000)))
+
+ ;; Load the files explicitly, to avoid having to keep
+ ;; around the large tables they contain (as well as the
+ ;; ones which get built).
+ (cond ((string= "Korean" current-language-environment)
+ (load "subst-jis")
+ (load "subst-big5")
+ (load "subst-gb2312")
+ (load "subst-ksc"))
+ ((string= "Chinese-BIG5" current-language-environment)
+ (load "subst-jis")
+ (load "subst-ksc")
+ (load "subst-gb2312")
+ (load "subst-big5"))
+ ((string= "Chinese-GB" current-language-environment)
+ (load "subst-jis")
+ (load "subst-ksc")
+ (load "subst-big5")
+ (load "subst-gb2312"))
+ (t
+ (load "subst-ksc")
+ (load "subst-gb2312")
+ (load "subst-big5")
+ (load "subst-jis"))) ; jis covers as much as big5, gb2312
+
+ (when redefined
+ (define-translation-hash-table 'utf-subst-table-for-decode
+ ucs-unicode-to-mule-cjk)
+ (define-translation-hash-table 'utf-subst-table-for-encode
+ ucs-mule-cjk-to-unicode)
+ (set-char-table-extra-slot (get 'utf-translation-table-for-encode
+ 'translation-table)
+ 1 ucs-mule-cjk-to-unicode))
+
+ (setq utf-translate-cjk-lang-env current-language-environment)))
+
+ (defun utf-lookup-subst-table-for-decode (code-point)
+ (if (and utf-translate-cjk-mode
+ (not utf-translate-cjk-lang-env)
+ (utf-translate-cjk-substitutable-p code-point))
+ (utf-translate-cjk-load-tables))
+ (gethash code-point
+ (get 'utf-subst-table-for-decode 'translation-hash-table)))
+
+
+ (defun utf-lookup-subst-table-for-encode (char)
+ (if (and utf-translate-cjk-mode
+ (not utf-translate-cjk-lang-env)
+ (memq (char-charset char) utf-translate-cjk-charsets))
+ (utf-translate-cjk-load-tables))
+ (gethash char
+ (get 'utf-subst-table-for-encode 'translation-hash-table)))
+
(define-minor-mode utf-translate-cjk-mode
"Whether the UTF based coding systems should decode/encode CJK characters.
! Enabling this allows the coding systems mule-utf-8,
mule-utf-16le and mule-utf-16be to encode characters in the charsets
`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
***************
*** 203,251 ****
turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
Chinese-Big5 and jisx for other environments.
! The tables are large (over 40000 entries), so this option is not the
! default. Also, installing them may be rather slow."
! :init-value nil
:version "21.4"
:type 'boolean
- :set-after '(current-language-environment)
:group 'mule
:global t
(if utf-translate-cjk-mode
- ;; Fixme: Allow the use of the CJK charsets to be
- ;; customized by reordering and possible omission.
(progn
- ;; Redefine them with realistic initial sizes and a
- ;; smallish rehash size to avoid wasting significant
- ;; space after they're built.
- (setq ucs-mule-cjk-to-unicode
- (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
- ucs-unicode-to-mule-cjk
- (make-hash-table :test 'eq :size 21500 :rehash-size 1000))
- ;; Load the files explicitly, to avoid having to keep
- ;; around the large tables they contain (as well as the
- ;; ones which get built).
- (cond
- ((string= "Korean" current-language-environment)
- (load "subst-jis")
- (load "subst-big5")
- (load "subst-gb2312")
- (load "subst-ksc"))
- ((string= "Chinese-BIG5" current-language-environment)
- (load "subst-jis")
- (load "subst-ksc")
- (load "subst-gb2312")
- (load "subst-big5"))
- ((string= "Chinese-GB" current-language-environment)
- (load "subst-jis")
- (load "subst-ksc")
- (load "subst-big5")
- (load "subst-gb2312"))
- (t
- (load "subst-ksc")
- (load "subst-gb2312")
- (load "subst-big5")
- (load "subst-jis"))) ; jis covers as much as big5, gb2312
(define-translation-hash-table 'utf-subst-table-for-decode
ucs-unicode-to-mule-cjk)
(define-translation-hash-table 'utf-subst-table-for-encode
--- 296,311 ----
turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
Chinese-Big5 and jisx for other environments.
! This option is on by default. If you are not interested in CJK
! characters and want to avoid some overhead on encoding/decoding
! by the above coding systems, you can customize this option to nil."
! :init-value t
:version "21.4"
:type 'boolean
:group 'mule
:global t
(if utf-translate-cjk-mode
(progn
(define-translation-hash-table 'utf-subst-table-for-decode
ucs-unicode-to-mule-cjk)
(define-translation-hash-table 'utf-subst-table-for-encode
***************
*** 259,265 ****
(make-hash-table :test 'eq))
(set-char-table-extra-slot (get 'utf-translation-table-for-encode
'translation-table)
! 1 nil)))
(define-ccl-program ccl-decode-mule-utf-8
;;
--- 319,376 ----
(make-hash-table :test 'eq))
(set-char-table-extra-slot (get 'utf-translation-table-for-encode
'translation-table)
! 1 nil))
!
! ;; Update safe-chars of mule-utf-* coding systems.
! (dolist (elt (coding-system-list t))
! (if (string-match "^mule-utf" (symbol-name elt))
! (let ((safe-charsets (coding-system-get elt 'safe-charsets))
! (safe-chars (coding-system-get elt 'safe-chars))
! (need-update nil))
! (dolist (charset utf-translate-cjk-charsets)
! (unless (eq utf-translate-cjk-mode (memq charset safe-charsets))
! (setq safe-charsets
! (if utf-translate-cjk-mode
! (cons charset safe-charsets)
! (delq charset safe-charsets))
! need-update t)
! (aset safe-chars (make-char charset) utf-translate-cjk-mode)))
! (when need-update
! (coding-system-put elt 'safe-charsets safe-charsets)
! (define-coding-system-internal elt))))))
!
! (define-ccl-program ccl-mule-utf-untrans
! ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or
! ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF). Write
! ;; eight-bit-control/graphic sequence (2 to 4 chars) representing
! ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified.
! ;;
! ;; This is a subrountine because we assume that this is called very
! ;; rarely (so we don't have to worry about the overhead of the
! ;; call).
! `(0
! ((r5 = ,(charset-id 'eight-bit-control))
! (r6 = ,(charset-id 'eight-bit-graphic))
! (if (r0 < #x100)
! ((r4 = ((r0 >> 6) | #xC0))
! (write-multibyte-character r6 r4))
! ((if (r0 < #x10000)
! ((r4 = ((r0 >> 12) | #xE0))
! (write-multibyte-character r6 r4))
! ((r4 = ((r0 >> 18) | #xF0))
! (write-multibyte-character r6 r4)
! (r4 = (((r0 >> 12) & #x3F) | #x80))
! (if (r4 < #xA0)
! (write-multibyte-character r5 r4)
! (write-multibyte-character r6 r4))))
! (r4 = (((r0 >> 6) & #x3F) | #x80))
! (if (r4 < #xA0)
! (write-multibyte-character r5 r4)
! (write-multibyte-character r6 r4))))
! (r4 = ((r0 & #x3F) | #x80))
! (if (r4 < #xA0)
! (write-multibyte-character r5 r4)
! (write-multibyte-character r6 r4)))))
(define-ccl-program ccl-decode-mule-utf-8
;;
***************
*** 278,537 ****
;; (>= 8000) | |
;; mule-unicode-2500-33ff | 3 | 4
;; mule-unicode-e000-ffff | 3 | 4
;;
;; Thus magnification factor is two.
;;
`(2
! ((r5 = ,(charset-id 'eight-bit-control))
! (r6 = ,(charset-id 'eight-bit-graphic))
(loop
! (r0 = -1)
! (read r0)
!
! ;; 1byte encoding, i.e., ascii
(if (r0 < #x80)
! ((write r0))
! (if (r0 < #xc0) ; continuation byte (invalid here)
! ((if (r0 < #xa0)
! (write-multibyte-character r5 r0)
! (write-multibyte-character r6 r0)))
! ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
! (if (r0 < #xe0)
! ((r1 = -1)
! (read r1)
!
! (if ((r1 & #b11000000) != #b10000000)
! ;; Invalid 2-byte sequence
! ((if (r0 < #xa0)
! (write-multibyte-character r5 r0)
! (write-multibyte-character r6 r0))
! (if (r1 < #x80)
! (write r1)
! (if (r1 < #xa0)
! (write-multibyte-character r5 r1)
! (write-multibyte-character r6 r1))))
!
! ((r3 = r0) ; save in case of overlong sequence
! (r2 = r1)
! (r0 &= #x1f)
! (r0 <<= 6)
! (r1 &= #x3f)
! (r1 += r0)
! ;; Now r1 holds scalar value
!
! (if (r1 < 128) ; `overlong sequence'
! ((if (r3 < #xa0)
! (write-multibyte-character r5 r3)
! (write-multibyte-character r6 r3))
! (if (r2 < #x80)
! (write r2)
! (if (r2 < #xa0)
! (write-multibyte-character r5 r2)
! (write-multibyte-character r6 r2))))
!
! ;; eight-bit-control
! (if (r1 < 160)
! ((write-multibyte-character r5 r1))
!
! ;; latin-iso8859-1
! (if (r1 < 256)
! ((r0 = ,(charset-id 'latin-iso8859-1))
! (r1 -= 128)
! (write-multibyte-character r0 r1))
!
! ;; mule-unicode-0100-24ff (< 0800)
! ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
! (r1 -= #x0100)
! (r2 = (((r1 / 96) + 32) << 7))
! (r1 %= 96)
! (r1 += (r2 + 32))
! (translate-character
! utf-translation-table-for-decode r0 r1)
! (write-multibyte-character r0 r1))))))))
!
! ;; 3byte encoding
! ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
! (if (r0 < #xf0)
! ((r1 = -1)
! (r2 = -1)
! (read r1 r2)
!
! ;; This is set to 1 if the encoding is invalid.
! (r4 = 0)
!
! (r3 = (r1 & #b11000000))
! (r3 |= ((r2 >> 2) & #b00110000))
! (if (r3 != #b10100000)
! (r4 = 1)
! ((r3 = ((r0 & #x0f) << 12))
! (r3 += ((r1 & #x3f) << 6))
! (r3 += (r2 & #x3f))
! (if (r3 < #x0800)
! (r4 = 1))))
!
! (if (r4 != 0)
! ;; Invalid 3-byte sequence
! ((if (r0 < #xa0)
! (write-multibyte-character r5 r0)
! (write-multibyte-character r6 r0))
! (if (r1 < #x80)
! (write r1)
! (if (r1 < #xa0)
! (write-multibyte-character r5 r1)
! (write-multibyte-character r6 r1)))
! (if (r2 < #x80)
! (write r2)
! (if (r2 < #xa0)
! (write-multibyte-character r5 r2)
! (write-multibyte-character r6 r2))))
!
! ;; mule-unicode-0100-24ff (>= 0800)
! ((if (r3 < #x2500)
! ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
! (r3 -= #x0100)
! (r3 //= 96)
! (r1 = (r7 + 32))
! (r1 += ((r3 + 32) << 7))
! (translate-character
! utf-translation-table-for-decode r0 r1)
! (write-multibyte-character r0 r1))
!
! ;; mule-unicode-2500-33ff
! (if (r3 < #x3400)
! ((r4 = r3) ; don't zap r3
! (lookup-integer utf-subst-table-for-decode r4 r5)
! (if r7
! ;; got a translation
! ((write-multibyte-character r4 r5)
! ;; Zapped through register starvation.
! (r5 = ,(charset-id 'eight-bit-control)))
! ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
! (r3 -= #x2500)
! (r3 //= 96)
! (r1 = (r7 + 32))
! (r1 += ((r3 + 32) << 7))
! (write-multibyte-character r0 r1))))
!
! ;; U+3400 .. U+D7FF
! ;; Try to convert to CJK chars, else keep
! ;; them as eight-bit-{control|graphic}.
! (if (r3 < #xd800)
! ((r4 = r3) ; don't zap r3
! (lookup-integer utf-subst-table-for-decode r4 r5)
! (if r7
! ;; got a translation
! ((write-multibyte-character r4 r5)
! ;; Zapped through register starvation.
! (r5 = ,(charset-id 'eight-bit-control)))
! ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
! ((r3 = r6)
! (write-multibyte-character r3 r0)
! (if (r1 < #xa0)
! (r3 = r5))
! (write-multibyte-character r3 r1)
! (if (r2 < #xa0)
! (r3 = r5)
! (r3 = r6))
! (write-multibyte-character r3 r2))))
!
! ;; Surrogates, U+D800 .. U+DFFF
! (if (r3 < #xe000)
! ((r3 = r6)
! (write-multibyte-character r3 r0) ;
eight-bit-graphic
! (if (r1 < #xa0)
! (r3 = r5))
! (write-multibyte-character r3 r1)
! (if (r2 < #xa0)
! (r3 = r5)
! (r3 = r6))
! (write-multibyte-character r3 r2))
!
! ;; mule-unicode-e000-ffff
! ;; Fixme: fffe and ffff are invalid.
! ((r4 = r3) ; don't zap r3
! (lookup-integer utf-subst-table-for-decode r4 r5)
! (if r7
! ;; got a translation
! ((write-multibyte-character r4 r5)
! ;; Zapped through register starvation.
! (r5 = ,(charset-id 'eight-bit-control)))
! ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
! (r3 -= #xe000)
! (r3 //= 96)
! (r1 = (r7 + 32))
! (r1 += ((r3 + 32) << 7))
! (write-multibyte-character r0 r1)))))))))))
!
! (if (r0 < #xfe)
! ;; 4byte encoding
! ;; keep those bytes as eight-bit-{control|graphic}
! ;; Fixme: allow lookup in utf-subst-table-for-decode.
! ((r1 = -1)
! (r2 = -1)
! (r3 = -1)
! (read r1 r2 r3)
! ;; r0 > #xf0, thus eight-bit-graphic
! (write-multibyte-character r6 r0)
! (if (r1 < #xa0)
! (if (r1 < #x80) ; invalid byte
! (write r1)
! (write-multibyte-character r5 r1))
! (write-multibyte-character r6 r1))
! (if (r2 < #xa0)
! (if (r2 < #x80) ; invalid byte
! (write r2)
! (write-multibyte-character r5 r2))
! (write-multibyte-character r6 r2))
! (if (r3 < #xa0)
! (if (r3 < #x80) ; invalid byte
! (write r3)
! (write-multibyte-character r5 r3))
! (write-multibyte-character r6 r3))
! (if (r0 >= #xf8) ; 5- or 6-byte encoding
! ((r0 = -1)
! (read r0)
! (if (r0 < #xa0)
! (if (r0 < #x80) ; invalid byte
! (write r0)
! (write-multibyte-character r5 r0))
! (write-multibyte-character r6 r0))
! (if (r0 >= #xfc) ; 6-byte
! ((r0 = -1)
! (read r0)
! (if (r0 < #xa0)
! (if (r0 < #x80) ; invalid byte
! (write r0)
! (write-multibyte-character r5 r0))
! (write-multibyte-character r6 r0)))))))
! ;; else invalid byte >= #xfe
! (write-multibyte-character r6 r0))))))
(repeat)))
;; At EOF...
(if (r0 >= 0)
! ((if (r0 < #x80)
! (write r0)
! (if (r0 < #xa0)
! (write-multibyte-character r5 r0)
! ((write-multibyte-character r6 r0))))
(if (r1 >= 0)
! ((if (r1 < #x80)
! (write r1)
! (if (r1 < #xa0)
! (write-multibyte-character r5 r1)
! ((write-multibyte-character r6 r1))))
(if (r2 >= 0)
! ((if (r2 < #x80)
! (write r2)
! (if (r2 < #xa0)
! (write-multibyte-character r5 r2)
! ((write-multibyte-character r6 r2))))
(if (r3 >= 0)
! (if (r3 < #x80)
! (write r3)
! (if (r3 < #xa0)
! (write-multibyte-character r5 r3)
! ((write-multibyte-character r6 r3))))))))))))
"CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and
--- 389,594 ----
;; (>= 8000) | |
;; mule-unicode-2500-33ff | 3 | 4
;; mule-unicode-e000-ffff | 3 | 4
+ ;; -----------------------+----------------+---------------
+ ;; invalid byte | 1 | 2
;;
;; Thus magnification factor is two.
;;
`(2
! ((r0 = -1)
(loop
! (if (r0 < 0)
! (read r0))
(if (r0 < #x80)
! ;; 1-byte encoding, i.e., ascii
! ((write r0)
! (r0 = -1)
! (repeat)))
! (if (r0 < #xc0) ; continuation byte (invalid here)
! ((call ccl-mule-utf-untrans)
! (r0 = -1)
! (repeat)))
!
! ;; Read the 2nd byte.
! (r1 = -1)
! (read r1)
! (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
! ((call ccl-mule-utf-untrans)
! ;; Handle it in the next loop.
! (r0 = r1)
! (repeat)))
!
! (if (r0 < #xe0)
! ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
! ((r2 = ((r0 & #x1F) << 6))
! (r2 |= (r1 & #x3F))
! ;; Now r2 holds scalar value
!
! (if (r2 < 128) ; `overlong sequence'
! ((call ccl-mule-utf-untrans)
! (r0 = r1)
! (call ccl-mule-utf-untrans)
! (r0 = -1)
! (repeat)))
!
! (r1 = r2)
! (if (r1 < 160)
! ;; eight-bit-control
! (r0 = ,(charset-id 'eight-bit-control))
! (if (r1 < 256)
! ;; latin-iso8859-1
! ((r0 = ,(charset-id 'latin-iso8859-1))
! (r1 -= 128))
! ;; mule-unicode-0100-24ff (< 0800)
! ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
! (r1 -= #x0100)
! (r2 = (((r1 / 96) + 32) << 7))
! (r1 %= 96)
! (r1 += (r2 + 32))
! (translate-character
! utf-translation-table-for-decode r0 r1))))
! (write-multibyte-character r0 r1)
! (r0 = -1)
! (repeat)))
!
! ;; Read the 3rd bytes.
! (r2 = -1)
! (read r2)
! (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
! ((call ccl-mule-utf-untrans)
! (r0 = r1)
! (call ccl-mule-utf-untrans)
! ;; Handle it in the next loop.
! (r0 = r2)
! (repeat)))
!
! (if (r0 < #xF0)
! ;; 3byte encoding
! ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
! ((r3 = ((r0 & #xF) << 12))
! (r3 |= ((r1 & #x3F) << 6))
! (r3 |= (r2 & #x3F))
!
! (if (r3 < #x800) ; `overlong sequence'
! ((call ccl-mule-utf-untrans)
! (r0 = r1)
! (call ccl-mule-utf-untrans)
! (r0 = r2)
! (call ccl-mule-utf-untrans)
! (r0 = -1)
! (repeat)))
!
! (if (r3 < #x2500)
! ;; mule-unicode-0100-24ff (>= 0800)
! ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
! (r3 -= #x0100)
! (r3 //= 96)
! (r1 = (r7 + 32))
! (r1 += ((r3 + 32) << 7))
! (translate-character
! utf-translation-table-for-decode r0 r1)
! (write-multibyte-character r0 r1)
! (r0 = -1)
! (repeat)))
!
! (if (r3 < #x3400)
! ;; mule-unicode-2500-33ff
! ((r0 = r3) ; don't zap r3
! (lookup-integer utf-subst-table-for-decode r0 r1)
! (if (r7 == 0)
! ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
! (r3 -= #x2500)
! (r3 //= 96)
! (r1 = (r7 + 32))
! (r1 += ((r3 + 32) << 7))))
! (write-multibyte-character r0 r1)
! (r0 = -1)
! (repeat)))
!
! (if (r3 < #xE000)
! ;; Try to convert to CJK chars, else
! ;; keep them as eight-bit-{control|graphic}.
! ((r0 = r3)
! (lookup-integer utf-subst-table-for-decode r3 r1)
! (if r7
! ;; got a translation
! (write-multibyte-character r3 r1)
! (call ccl-mule-utf-untrans))
! (r0 = -1)
! (repeat)))
!
! ;; mule-unicode-e000-ffff
! ;; Fixme: fffe and ffff are invalid.
! (r0 = r3) ; don't zap r3
! (lookup-integer utf-subst-table-for-decode r0 r1)
! (if (r7 == 0)
! ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
! (r3 -= #xe000)
! (r3 //= 96)
! (r1 = (r7 + 32))
! (r1 += ((r3 + 32) << 7))))
! (write-multibyte-character r0 r1)
! (r0 = -1)
! (repeat)))
!
! ;; Read the 4th bytes.
! (r3 = -1)
! (read r3)
! (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
! ((call ccl-mule-utf-untrans)
! (r0 = r1)
! (call ccl-mule-utf-untrans)
! ;; Handle it in the next loop.
! (r0 = r3)
! (repeat)))
!
! (if (r3 < #xF8)
! ;; 4-byte encoding:
! ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
! ;; keep those bytes as eight-bit-{control|graphic}
! ;; Fixme: allow lookup in utf-subst-table-for-decode.
! ((r4 = ((r0 & #x7) << 18))
! (r4 |= ((r1 & #x3F) << 12))
! (r4 |= ((r2 & #x3F) << 6))
! (r4 |= (r3 & #x3F))
!
! (if (r4 < #x10000) ; `overlong sequence'
! ((call ccl-mule-utf-untrans)
! (r0 = r1)
! (call ccl-mule-utf-untrans)
! (r0 = r2)
! (call ccl-mule-utf-untrans)
! (r0 = r3)
! (call ccl-mule-utf-untrans))
! ((r0 = r4)
! (call ccl-mule-utf-untrans)))
! (r0 = -1)
! (repeat)))
!
! ;; Unsupported sequence.
! (call ccl-mule-utf-untrans)
! (r0 = r1)
! (call ccl-mule-utf-untrans)
! (r0 = r2)
! (call ccl-mule-utf-untrans)
! (r0 = r3)
! (call ccl-mule-utf-untrans)
! (r0 = -1)
(repeat)))
;; At EOF...
(if (r0 >= 0)
! ;; r0 >= #x80
! ((call ccl-mule-utf-untrans)
(if (r1 >= 0)
! ((r0 = r1)
! (call ccl-mule-utf-untrans)
(if (r2 >= 0)
! ((r0 = r2)
! (call ccl-mule-utf-untrans)
(if (r3 >= 0)
! ((r0 = r3)
! (call ccl-mule-utf-untrans))))))))))
"CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and
***************
*** 540,703 ****
Encodings of un-representable Unicode characters are decoded asis into
eight-bit-control and eight-bit-graphic characters.")
(define-ccl-program ccl-encode-mule-utf-8
`(1
((r5 = -1)
(loop
(if (r5 < 0)
! ((r1 = -1)
! (read-multibyte-character r0 r1)
! (translate-character utf-translation-table-for-encode r0 r1))
! (;; We have already done read-multibyte-character.
! (r0 = r5)
(r1 = r6)
(r5 = -1)))
(if (r0 == ,(charset-id 'ascii))
! (write r1)
! (if (r0 == ,(charset-id 'latin-iso8859-1))
! ;; r1 scalar utf-8
! ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
! ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
! ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
! ((r0 = (((r1 & #x40) >> 6) | #xc2))
! (r1 &= #x3f)
(r1 |= #x80)
! (write r0 r1))
!
! (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
! ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
! ;; #x3f80 == (0011 1111 1000 0000)b
! (r1 &= #x7f)
! (r1 += (r0 + 224)) ; 240 == -32 + #x0100
! ;; now r1 holds scalar value
! (if (r1 < #x0800)
! ;; 2byte encoding
! ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
! ;; #x07c0 == (0000 0111 1100 0000)b
! (r1 &= #x3f)
! (r1 |= #x80)
! (write r0 r1))
! ;; 3byte encoding
! ((r0 = (((r1 & #xf000) >> 12) | #xe0))
! (r2 = ((r1 & #x3f) | #x80))
! (r1 &= #x0fc0)
! (r1 >>= 6)
! (r1 |= #x80)
! (write r0 r1 r2))))
!
! (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
! ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
! (r1 &= #x7f)
! (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
! (r0 = (((r1 & #xf000) >> 12) | #xe0))
! (r2 = ((r1 & #x3f) | #x80))
! (r1 &= #x0fc0)
! (r1 >>= 6)
! (r1 |= #x80)
! (write r0 r1 r2))
!
! (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
! ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
! (r1 &= #x7f)
! (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
! (r0 = (((r1 & #xf000) >> 12) | #xe0))
! (r2 = ((r1 & #x3f) | #x80))
! (r1 &= #x0fc0)
! (r1 >>= 6)
! (r1 |= #x80)
! (write r0 r1 r2))
!
! (if (r0 == ,(charset-id 'eight-bit-control))
! ;; r1 scalar utf-8
! ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
! ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
! ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
! ((write #xc2)
! (write r1))
!
! (if (r0 == ,(charset-id 'eight-bit-graphic))
! ;; r1 scalar utf-8
! ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
! ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
! ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
! ((write r1)
! (r1 = -1)
! (read-multibyte-character r0 r1)
! (if (r0 != ,(charset-id 'eight-bit-graphic))
! (if (r0 != ,(charset-id 'eight-bit-control))
! ((r5 = r0)
! (r6 = r1))))
! (if (r5 < 0)
! ((read-multibyte-character r0 r2)
! (if (r0 != ,(charset-id 'eight-bit-graphic))
! (if (r0 != ,(charset-id 'eight-bit-control))
! ((r5 = r0)
! (r6 = r2))))
! (if (r5 < 0)
! (write r1 r2)
! (if (r1 < #xa0)
! (write r1)
! ((write #xc2)
! (write r1)))))))
!
! ((lookup-character utf-subst-table-for-encode r0 r1)
! (if r7 ; lookup succeeded
! ((r1 = (((r0 & #xf000) >> 12) | #xe0))
! (r2 = ((r0 & #x3f) | #x80))
! (r0 &= #x0fc0)
! (r0 >>= 6)
! (r0 |= #x80)
! (write r1 r0 r2))
! ;; Unsupported character.
! ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
! ((write #xef)
! (write #xbf)
! (write #xbd)))))))))))
! (repeat)))
! (if (r1 >= #xa0)
! (write r1)
! (if (r1 >= #x80)
! ((write #xc2)
! (write r1)))))
"CCL program to encode into UTF-8.")
(define-ccl-program ccl-untranslated-to-ucs
`(0
! (if (r0 < #xf0) ; 3-byte encoding, as above
! ((r4 = 0)
! (r3 = (r1 & #b11000000))
! (r3 |= ((r2 >> 2) & #b00110000))
! (if (r3 != #b10100000)
! (r4 = 1)
! ((r3 = ((r0 & #x0f) << 12))
! (r3 += ((r1 & #x3f) << 6))
! (r3 += (r2 & #x3f))
! (if (r3 < #x0800)
! (r4 = 1))))
! (if (r4 != 0)
! (r0 = 0)
! (r0 = r3)))
! (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe)
! ((r4 = (r1 >> 6))
! (if (r4 != #b10)
! (r0 = 0)
! ((r4 = (r2 >> 6))
! (if (r4 != #b10)
! (r0 = 0)
! ((r4 = (r3 >> 6))
! (if (r4 != #b10)
! (r0 = 0)
! ((r1 = ((r1 & #x3F) << 12))
! (r2 = ((r2 & #x3F) << 6))
! (r3 &= #x3F)
! (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
! (r0 = 0))))
! "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
! r0 == 0 for invalid sequence.")
(defvar utf-8-ccl-regs (make-vector 8 0))
--- 597,802 ----
Encodings of un-representable Unicode characters are decoded asis into
eight-bit-control and eight-bit-graphic characters.")
+ (define-ccl-program ccl-mule-utf-8-encode-untrans
+ ;; UTF-8 decoder generates an UTF-8 sequence represented by a
+ ;; sequence eight-bit-control/graphic chars for an untranslatable
+ ;; character and an invalid byte.
+ ;;
+ ;; This CCL parses that sequence (the first byte is already in r1),
+ ;; writes out the original bytes of that sequence, and sets r5 to
+ ;; -1.
+ ;;
+ ;; If the eight-bit-control/graphic sequence is shorter than what r1
+ ;; suggests, it sets r5 and r6 to the last character read that
+ ;; should be handled by the next loop of a caller.
+ ;;
+ ;; Note: For UTF-8 validation, we only check if a character is
+ ;; eight-bit-control/graphic or not. It may result in incorrect
+ ;; handling of random binary data, but such a data can't be encoded
+ ;; by UTF-8 anyway. At least, UTF-8 decoders doesn't generate such
+ ;; a sequence even if a source contains invalid byte-sequence.
+ `(0
+ (;; Read the 2nd byte.
+ (read-multibyte-character r5 r6)
+ (r0 = (r5 != ,(charset-id 'eight-bit-control)))
+ (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
+ ((write r1) ; invalid UTF-8
+ (r1 = -1)
+ (end)))
+
+ (if (r1 <= #xC3)
+ ;; 2-byte sequence for an originally invalid byte.
+ ((r6 &= #x3F)
+ (r6 |= ((r1 & #x1F) << 6))
+ (write r6)
+ (r5 = -1)
+ (end)))
+
+ (write r1 r6)
+ (r2 = r1)
+ (r1 = -1)
+ ;; Read the 3rd byte.
+ (read-multibyte-character r5 r6)
+ (r0 = (r5 != ,(charset-id 'eight-bit-control)))
+ (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
+ (end)) ; invalid UTF-8
+ (write r6)
+ (if (r2 < #xF0)
+ ;; 3-byte sequence for an untranslated character.
+ ((r5 = -1)
+ (end)))
+ ;; Read the 4th byte.
+ (read-multibyte-character r5 r6)
+ (r0 = (r5 != ,(charset-id 'eight-bit-control)))
+ (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
+ (end)) ; invalid UTF-8
+ ;; 4-byte sequence for an untranslated character.
+ (write r6)
+ (r5 = -1)
+ (end))
+
+ ;; At EOF...
+ ((r5 = -1)
+ (if (r1 >= 0)
+ (write r1)))))
+
(define-ccl-program ccl-encode-mule-utf-8
`(1
((r5 = -1)
(loop
(if (r5 < 0)
! (read-multibyte-character r0 r1)
! ;; Pre-read character is in r5 (charset-ID) and r6 (code-point).
! ((r0 = r5)
(r1 = r6)
(r5 = -1)))
+ (translate-character utf-translation-table-for-encode r0 r1)
(if (r0 == ,(charset-id 'ascii))
! (write-repeat r1))
! (if (r0 == ,(charset-id 'latin-iso8859-1))
! ;; r1 scalar utf-8
! ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
! ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
! ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
! ((r0 = (((r1 & #x40) >> 6) | #xc2))
! (r1 &= #x3f)
! (r1 |= #x80)
! (write r0)
! (write-repeat r1)))
!
! (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
! ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
! ;; #x3f80 == (0011 1111 1000 0000)b
! (r1 &= #x7f)
! (r1 += (r0 + 224)) ; 240 == -32 + #x0100
! ;; now r1 holds scalar value
! (if (r1 < #x0800)
! ;; 2byte encoding
! ((write ((r1 >> 6) | #xC0))
! (r1 &= #x3F)
! (r1 |= #x80)
! (write-repeat r1))
! ;; 3byte encoding
! ((write ((r1 >> 12) | #xE0))
! (write (((r1 & #x0FC0) >> 6) | #x80))
! (r1 &= #x3F)
! (r1 |= #x80)
! (write-repeat r1)))))
!
! (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
! ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
! (r1 &= #x7f)
! (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
! ;; now r1 holds scalar value
! (write ((r1 >> 12) | #xE0))
! (write (((r1 & #x0FC0) >> 6) | #x80))
! (r1 &= #x3F)
! (r1 |= #x80)
! (write-repeat r1)))
!
! (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
! ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
! (r1 &= #x7f)
! (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
! ;; now r1 holds scalar value
! (write ((r1 >> 12) | #xE0))
! (write (((r1 & #x0FC0) >> 6) | #x80))
! (r1 &= #x3F)
! (r1 |= #x80)
! (write-repeat r1)))
!
! (if (r0 == ,(charset-id 'eight-bit-control))
! ;; r1 scalar utf-8
! ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
! ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
! ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
! ((write #xC2)
! (write-repeat r1)))
!
! (if (r0 == ,(charset-id 'eight-bit-graphic))
! ;; r1 scalar utf-8
! ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
! ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
! ;; ff 0000 0000 1111 1111 1101 1111 1011 1111
! ((r0 = (r1 >= #xC0))
! (r0 &= (r1 <= #xC3))
! (r4 = (r1 >= #xE1))
! (r4 &= (r1 <= #xF7))
! (r0 |= r4)
! (if r0
! ((call ccl-mule-utf-8-encode-untrans)
! (repeat))
! (write-repeat r1))))
!
! (lookup-character utf-subst-table-for-encode r0 r1)
! (if r7 ; lookup succeeded
! (if (r0 < #x800)
! ;; 2byte encoding
! ((write ((r0 >> 6) | #xC0))
! (r1 &= #x3F)
! (r1 |= #x80)
! (write-repeat r1))
! ;; 3byte encoding
! ((write ((r0 >> 12) | #xE0))
! (write (((r0 & #x0FC0) >> 6) | #x80))
! (r1 &= #x3F)
(r1 |= #x80)
! (write-repeat r1))))
+ ;; Unsupported character.
+ ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
+ (write #xef)
+ (write #xbf)
+ (write-repeat #xbd))))
"CCL program to encode into UTF-8.")
(define-ccl-program ccl-untranslated-to-ucs
`(0
! (if (r1 == 0)
! nil
! (if (r0 <= #xC3) ; 2-byte encoding
! ((r0 = ((r0 & #x3) << 6))
! (r0 |= (r1 & #x3F))
! (r1 = 2))
! (if (r2 == 0)
! (r1 = 0)
! (if (r0 < #xF0) ; 3-byte encoding, as above
! ((r0 = ((r0 & #xF) << 12))
! (r0 |= ((r1 & #x3F) << 6))
! (r0 |= (r1 & #x3F))
! (r1 = 3))
! (if (r3 == 0)
! (r1 = 0)
! ((r0 = ((r0 & #x7) << 18))
! (r0 |= ((r1 & #x3F) << 12))
! (r0 |= ((r2 & #x3F) << 6))
! (r0 |= (r3 & #x3F))
! (r1 = 4))))))))
! "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
! Set r1 to the byte length. r0 == 0 for invalid sequence.")
(defvar utf-8-ccl-regs (make-vector 8 0))
***************
*** 708,740 ****
(aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
(aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
(aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
! (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
! (aref utf-8-ccl-regs 0))
(defun utf-8-help-echo (window object position)
(format "Untranslated Unicode U+%04X"
(get-char-property position 'untranslated-utf-8 object)))
! ;; We compose the untranslatable sequences into a single character.
;; This is infelicitous for editing, because there's currently no
;; mechanism for treating compositions as atomic, but is OK for
;; display. They are composed to U+FFFD with help-echo which
;; indicates the unicodes they represent. This function GCs too much.
! (defsubst utf-8-compose ()
! "Put a suitable composition on an untranslatable sequence.
! Return the sequence's length."
! (let* ((u (utf-8-untranslated-to-ucs))
! (l (unless (zerop u)
! (if (>= u #x10000)
! 4
! 3))))
! (when l
! (put-text-property (point) (min (point-max) (+ l (point)))
! 'untranslated-utf-8 u)
! (put-text-property (point) (min (point-max) (+ l (point)))
! 'help-echo 'utf-8-help-echo)
! (compose-region (point) (+ l (point)) ?$,3u=(B)
! l)))
(defcustom utf-8-compose-scripts nil
"*Non-nil means compose various scripts on decoding utf-8 text."
--- 807,853 ----
(aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
(aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
(aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
! (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
(defun utf-8-help-echo (window object position)
(format "Untranslated Unicode U+%04X"
(get-char-property position 'untranslated-utf-8 object)))
! ;; We compose the untranslatable sequences into a single character,
! ;; and move point to the next character.
;; This is infelicitous for editing, because there's currently no
;; mechanism for treating compositions as atomic, but is OK for
;; display. They are composed to U+FFFD with help-echo which
;; indicates the unicodes they represent. This function GCs too much.
!
! ;; If utf-translate-cjk-mode is non-nil, this function is called with
! ;; HASH-TABLE which translates CJK characters into some of CJK
! ;; charsets.
!
! (defsubst utf-8-compose (hash-table)
! "Put a suitable composition on an untranslatable sequence at point.
! If HASH-TABLE is non-nil, try to translate CJK characters by it at first.
! Move point to the end of the sequence."
! (utf-8-untranslated-to-ucs)
! (let ((l (aref utf-8-ccl-regs 1))
! ch)
! (if (> l 0)
! (if (and hash-table
! (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table)))
! (progn
! (insert ch)
! (delete-region (point) (min (point-max) (+ l (point)))))
! (setq ch (aref utf-8-ccl-regs 0))
! (put-text-property (point) (min (point-max) (+ l (point)))
! 'untranslated-utf-8 ch)
! (put-text-property (point) (min (point-max) (+ l (point)))
! 'help-echo 'utf-8-help-echo)
! (if (= l 2)
! (put-text-property (point) (min (point-max) (+ l (point)))
! 'display (format "\\%03o" ch))
! (compose-region (point) (+ l (point)) ?$,3u=(B))
! (forward-char l))
! (forward-char 1))))
(defcustom utf-8-compose-scripts nil
"*Non-nil means compose various scripts on decoding utf-8 text."
***************
*** 744,781 ****
(defun utf-8-post-read-conversion (length)
"Compose untranslated utf-8 sequences into single characters.
Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
(save-excursion
! ;; Can't do eval-when-compile to insert a multibyte constant
! ;; version of the string in the loop, since it's always loaded as
! ;; unibyte from a byte-compiled file.
! (let ((range (string-as-multibyte "^\xe1-\xf7")))
! (while (and (skip-chars-forward range)
! (not (eobp)))
! (forward-char (utf-8-compose)))))
! ;; Fixme: Takahashi-san implies it may not work this easily. I
! ;; asked why but didn't get a reply. -- fx
! (when (and utf-8-compose-scripts (> length 1))
! ;; These currently have definitions which cover the relevant
! ;; unicodes. We could avoid loading thai-util &c by checking
! ;; whether the region contains any characters with the appropriate
! ;; categories. There aren't yet Unicode-based rules for Tibetan.
! (save-excursion (setq length (diacritic-post-read-conversion length)))
! (save-excursion (setq length (thai-post-read-conversion length)))
! (save-excursion (setq length (lao-post-read-conversion length)))
! (save-excursion (setq length (devanagari-post-read-conversion length)))
! (save-excursion (setq length (malayalam-post-read-conversion length)))
! (save-excursion (setq length (tamil-post-read-conversion length))))
! length)
!
! ;; ucs-tables is preloaded
! ;; (defun utf-8-pre-write-conversion (beg end)
! ;; "Semi-dummy pre-write function effectively to autoload ucs-tables."
! ;; ;; Ensure translation-table is loaded.
! ;; (require 'ucs-tables)
! ;; ;; Don't do this again.
! ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
! ;; nil)
(make-coding-system
'mule-utf-8 4 ?u
--- 857,919 ----
(defun utf-8-post-read-conversion (length)
"Compose untranslated utf-8 sequences into single characters.
+ If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
(save-excursion
! (save-restriction
! (narrow-to-region (point) (+ (point) length))
! ;; Can't do eval-when-compile to insert a multibyte constant
! ;; version of the string in the loop, since it's always loaded as
! ;; unibyte from a byte-compiled file.
! (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
! hash-table ch)
! (when utf-translate-cjk-mode
! (if (not utf-translate-cjk-lang-env)
! ;; Check these characters:
! ;; "U+2e80-U+33ff", "U+ff00-U+ffef"
! ;; We may have to translate them to CJK charsets.
! (let ((range2 "$,address@hidden(B-$,2G$,3r`(B-$,3u/(B"))
! (skip-chars-forward (concat range range2))
! (unless (eobp)
! (utf-translate-cjk-load-tables)
! (setq range (concat range range2)))
! (setq hash-table (get 'utf-subst-table-for-decode
! 'translation-hash-table)))))
! (while (and (skip-chars-forward range)
! (not (eobp)))
! (setq ch (following-char))
! (if (< ch 256)
! (utf-8-compose hash-table)
! (if (and hash-table
! (setq ch (gethash (encode-char ch 'ucs) hash-table)))
! (progn
! (insert ch)
! (delete-char 1))
! (forward-char 1)))))
!
! (when (and utf-8-compose-scripts (> length 1))
! ;; These currently have definitions which cover the relevant
! ;; unicodes. We could avoid loading thai-util &c by checking
! ;; whether the region contains any characters with the appropriate
! ;; categories. There aren't yet Unicode-based rules for Tibetan.
! (diacritic-compose-region (point-max) (point-min))
! (thai-compose-region (point-max) (point-min))
! (lao-compose-region (point-max) (point-min))
! (devanagari-compose-region (point-max) (point-min))
! (malayalam-compose-region (point-max) (point-min))
! (tamil-compose-region (point-max) (point-min)))
! (- (point-max) (point-min)))))
!
! (defun utf-8-pre-write-conversion (beg end)
! "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
! This is used as a post-read-conversion of utf-8 coding system."
! (if (and utf-translate-cjk-mode
! (not utf-translate-cjk-lang-env)
! (save-excursion
! (goto-char beg)
! (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))
! (utf-translate-cjk-load-tables))
! nil)
(make-coding-system
'mule-utf-8 4 ?u
***************
*** 797,814 ****
sequence representing U+FFFD (REPLACEMENT CHARACTER)."
'(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
! '((safe-charsets
ascii
eight-bit-control
eight-bit-graphic
latin-iso8859-1
mule-unicode-0100-24ff
mule-unicode-2500-33ff
! mule-unicode-e000-ffff)
(mime-charset . utf-8)
(coding-category . coding-category-utf-8)
(valid-codes (0 . 255))
! ;; (pre-write-conversion . utf-8-pre-write-conversion)
(post-read-conversion . utf-8-post-read-conversion)
(translation-table-for-encode . utf-translation-table-for-encode)
(dependency unify-8859-on-encoding-mode
--- 935,954 ----
sequence representing U+FFFD (REPLACEMENT CHARACTER)."
'(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
! `((safe-charsets
ascii
eight-bit-control
eight-bit-graphic
latin-iso8859-1
mule-unicode-0100-24ff
mule-unicode-2500-33ff
! mule-unicode-e000-ffff
! ,@(if utf-translate-cjk-mode
! utf-translate-cjk-charsets))
(mime-charset . utf-8)
(coding-category . coding-category-utf-8)
(valid-codes (0 . 255))
! (pre-write-conversion . utf-8-pre-write-conversion)
(post-read-conversion . utf-8-post-read-conversion)
(translation-table-for-encode . utf-translation-table-for-encode)
(dependency unify-8859-on-encoding-mode
- [Emacs-diffs] Changes to emacs/lisp/international/utf-8.el,
Kenichi Handa <=