[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: enhancements to utf-8.el
From: |
Dave Love |
Subject: |
Re: enhancements to utf-8.el |
Date: |
04 Nov 2001 15:34:34 +0000 |
User-agent: |
Gnus/5.09 (Gnus v5.9.0) Emacs/21.0.107 |
The diff I posted previously required ucs-tables to be loaded
explicitly after the patched utf-8. This new diff against the 21.1
original should DTRT, it being closer to what I actually run.
Index: utf-8.el
===================================================================
RCS file: /cvs/emacs/lisp/international/utf-8.el,v
retrieving revision 1.9
diff -u -p -r1.9 utf-8.el
--- utf-8.el 2001/07/16 12:22:59 1.9
+++ utf-8.el 2001/11/04 15:25:44
@@ -1,4 +1,4 @@
-;;; utf-8.el --- limited UTF-8 decoding/encoding support
+;;; utf-8.el --- Limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit-*-
;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
;; Licensed to the Free Software Foundation.
@@ -64,6 +64,7 @@
;; ascii | 1 | 1
;; -----------------------+----------------+---------------
;; eight-bit-control | 2 | 2
+ ;; eight-bit-graphic | 2 | 1
;; latin-iso8859-1 | 2 | 2
;; -----------------------+----------------+---------------
;; mule-unicode-0100-24ff | 2 | 4
@@ -228,7 +229,8 @@ characters.")
(loop
(if (r5 < 0)
((r1 = -1)
- (read-multibyte-character r0 r1))
+ (read-multibyte-character r0 r1)
+ (translate-character ucs-mule-8859-to-mule-unicode r0 r1))
(;; We have already done read-multibyte-character.
(r0 = r5)
(r1 = r6)
@@ -340,26 +342,105 @@ Only characters from the charsets ascii,
eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
Others are encoded as U+FFFD.")
+;; Dummy definition so that the CCL can be checked correctly; the
+;; actual data are loaded on demand.  The definition is skipped when
+;; the symbol is already bound, so an existing table is not replaced.
+(unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
+ (define-translation-table 'ucs-mule-8859-to-mule-unicode))
+
+(defsubst utf-8-untranslated-to-ucs ()
+  "Return the UCS code for the untranslated byte sequence at point.
+Only 3- and 4-byte sequences are handled: a lead byte below #xF0 is
+decoded as a 3-byte sequence, anything else as a 4-byte sequence.
+Return nil if the bytes the sequence needs are not available."
+  (let ((b1 (char-after))
+        (b2 (char-after (1+ (point))))
+        (b3 (char-after (+ 2 (point))))
+        ;; The fourth byte of the sequence is at offset 3 from point.
+        (b4 (char-after (+ 3 (point)))))
+    (if (and b1 b2 b3)
+        (cond ((< b1 ?\xf0)
+               ;; 3-byte form: 4 payload bits from b1, 6 each from b2 and b3.
+               (setq b2 (lsh (logand b2 ?\x3f) 6))
+               (setq b3 (logand b3 ?\x3f))
+               (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
+              (b4
+               ;; 4-byte form: 3 payload bits from b1, 6 each from b2..b4.
+               (setq b2 (lsh (logand b2 ?\x3f) 12))
+               (setq b3 (lsh (logand b3 ?\x3f) 6))
+               (setq b4 (logand b4 ?\x3f))
+               (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
+                                                     18)))))))))
+
+(defun utf-8-help-echo (window object position)
+ "Return a help string naming the untranslated character at POSITION.
+The code is read from the `untranslated-utf-8' property at POSITION in
+OBJECT.  WINDOW is not used."
+ (format "Untranslated Unicode U+%04X"
+ (get-char-property position 'untranslated-utf-8 object)))
+
+(defvar utf-8-subst-table nil
+ "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
+
+;; We compose the untranslatable sequences into a single character.
+;; This is infelicitous for editing, because there's currently no
+;; mechanism for treating compositions as atomic, but is OK for
+;; display. We try to compose an appropriate character from a hash
+;; table of CJK characters to display correctly. Otherwise we use
+;; U+FFFD. What we really should have is hash table lookup from CCL
+;; so that we could do this properly. This function GCs too much.
+(defsubst utf-8-compose ()
+ "Put a suitable composition on an untranslatable sequence.
+The sequence is the one starting at point.  Its code is recorded in the
+`untranslated-utf-8' text property.  The composition displays the
+character found for that code in `utf-8-subst-table', if any.
+Return the sequence's length, or nil if no code could be computed."
+ (let* ((u (utf-8-untranslated-to-ucs))
+ ;; Length of the sequence: 4 for codes from #x10000 up, otherwise 3.
+ (l (and u (if (>= u ?\x10000)
+ 4
+ 3)))
+ (subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
+ (when u
+ (put-text-property (point) (min (point-max) (+ l (point)))
+ 'untranslated-utf-8 u)
+ ;; Without a substitute character, attach the help-echo function
+ ;; and compose to a fixed fallback character instead.
+ (unless subst
+ (put-text-property (point) (min (point-max) (+ l (point)))
+ 'help-echo 'utf-8-help-echo)
+ (setq subst ?$,3u=(B))
+ (compose-region (point) (+ l (point)) subst)
+ l)))
+
+(defun utf-8-post-read-conversion (length)
+  "Compose untranslated utf-8 sequences into single characters.
+LENGTH is the number of characters of decoded text, which starts at
+point.  Only that text is examined.  Return LENGTH."
+  (save-excursion
+    (save-restriction
+      ;; Confine the scan to the text just decoded, so that text already
+      ;; in the buffer after it is never composed, and a sequence cut
+      ;; off at the end of the decoded text is not completed from it.
+      (narrow-to-region (point) (+ (point) length))
+      (while (and (skip-chars-forward
+                   (eval-and-compile ; missing optimization
+                     (string-as-multibyte "^\341-\377")))
+                  (not (eobp)))
+        ;; `utf-8-compose' returns nil when nothing could be composed;
+        ;; `forward-char' then moves over a single character.
+        (forward-char (utf-8-compose)))))
+  length)
+
+(defun utf-8-pre-write-conversion (beg end)
+ "Semi-dummy pre-write function effectively to autoload ucs-tables.
+BEG and END are ignored and no text is changed.  Return nil."
+ ;; Ensure translation table is loaded.
+ (require 'ucs-tables)
+ ;; Don't do this again: clear the `pre-write-conversion' property of
+ ;; `mule-utf-8' so that this function is not called on later writes.
+ (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
+ nil)
+
(make-coding-system
'mule-utf-8 4 ?u
"UTF-8 encoding for Emacs-supported Unicode characters.
-The supported Emacs character sets are:
+The supported Emacs character sets are the following, determined by the
+translation table `ucs-mule-8859-to-mule-unicode':
ascii
eight-bit-control
eight-bit-graphic
latin-iso8859-1
+ latin-iso8859-2
+ latin-iso8859-3
+ latin-iso8859-4
+ cyrillic-iso8859-5
+ greek-iso8859-7
+ hebrew-iso8859-8
+ latin-iso8859-9
+ latin-iso8859-14
+ latin-iso8859-15
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff
Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
are decoded into sequences of eight-bit-control and eight-bit-graphic
-characters to preserve their byte sequences. Emacs characters out of
-these ranges are encoded into U+FFFD.
-
-Note that, currently, characters in the mule-unicode charsets have no
-syntax and case information. Thus, for instance, upper- and
-lower-casing commands won't work with them."
+characters to preserve their byte sequences and composed to behave as
+a single character when editing. Emacs characters out of these ranges
+are encoded into U+FFFD."
'(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
'((safe-charsets
@@ -367,12 +448,23 @@ lower-casing commands won't work with th
eight-bit-control
eight-bit-graphic
latin-iso8859-1
+ latin-iso8859-15
+ latin-iso8859-14
+ latin-iso8859-9
+ hebrew-iso8859-8
+ greek-iso8859-7
+ cyrillic-iso8859-5
+ latin-iso8859-4
+ latin-iso8859-3
+ latin-iso8859-2
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff)
(mime-charset . utf-8)
(coding-category . coding-category-utf-8)
- (valid-codes (0 . 255))))
+ (valid-codes (0 . 255))
+ (pre-write-conversion . utf-8-pre-write-conversion)
+ (post-read-conversion . utf-8-post-read-conversion)))
(define-coding-system-alias 'utf-8 'mule-utf-8)
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Re: enhancements to utf-8.el,
Dave Love <=