emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] Changes to emacs/lisp/international/utf-16.el [lexbind]


From: Miles Bader
Subject: [Emacs-diffs] Changes to emacs/lisp/international/utf-16.el [lexbind]
Date: Tue, 14 Oct 2003 19:39:36 -0400

Index: emacs/lisp/international/utf-16.el
diff -c emacs/lisp/international/utf-16.el:1.8.2.1 
emacs/lisp/international/utf-16.el:1.8.2.2
*** emacs/lisp/international/utf-16.el:1.8.2.1  Fri Apr  4 01:20:26 2003
--- emacs/lisp/international/utf-16.el  Tue Oct 14 19:39:23 2003
***************
*** 1,6 ****
  ;;; utf-16.el --- UTF-16 encoding/decoding
  
! ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
  
  ;; Author: Dave Love <address@hidden>
  ;; Keywords: Unicode, UTF-16, i18n
--- 1,6 ----
  ;;; utf-16.el --- UTF-16 encoding/decoding
  
! ;; Copyright (C) 2001, 2002, 2003 Free Software Foundation, Inc.
  
  ;; Author: Dave Love <address@hidden>
  ;; Keywords: Unicode, UTF-16, i18n
***************
*** 25,37 ****
  ;;; Commentary:
  
  ;; Support for UTF-16, which is a two-byte encoding (modulo
! ;; surrogates) of Unicode, written either in little or big endian
! ;; order: coding-systems `mule-utf-16-le' and `mule-utf-16-be'.
! ;; (utf-16-le is used by the DozeN'T clipboard, for instance.)  The
! ;; data are preceeded by a two-byte signature which identifies their
! ;; byte sex.  These are used by the coding-category-utf-16-{b,l}e code
! ;; to identify the coding, but ignored on decoding.
! 
  ;; Note that un-decodable sequences aren't (yet?) preserved as raw
  ;; bytes, as they are with utf-8, so reading and writing as utf-16 can
  ;; corrupt data.
--- 25,43 ----
  ;;; Commentary:
  
  ;; Support for UTF-16, which is a two-byte encoding (modulo
! ;; surrogates) of Unicode, defined in RFC 2781.  It is written either
! ;; in little or big endian order and either with or without the
! ;; leading BOM (a two-byte signature which identifies the byte order).
! ;;
! ;; We provide these base coding systems.
! ;;    name                                    endian  BOM
! ;;    ----                                    ------  ---
! ;;    mule-utf-16le                           little  no
! ;;    mule-utf-16be                           big     no
! ;;    mule-utf-16le-with-signature            little  yes
! ;;    mule-utf-16be-with-signature            big     yes
! ;;    mule-utf-16                             both    yes
! ;;
  ;; Note that un-decodable sequences aren't (yet?) preserved as raw
  ;; bytes, as they are with utf-8, so reading and writing as utf-16 can
  ;; corrupt data.
***************
*** 69,76 ****
  (eval-and-compile
  (defconst utf-16-decode-ucs
    ;; We have the unicode in r1.  Output is charset ID in r0, code
!   ;; point in r1.  As r6 keeps endian information, the value should
!   ;; not be changed.
    `((lookup-integer utf-subst-table-for-decode r1 r3)
      (if r7                            ; got a translation
        ((r0 = r1) (r1 = r3))
--- 75,81 ----
  (eval-and-compile
  (defconst utf-16-decode-ucs
    ;; We have the unicode in r1.  Output is charset ID in r0, code
!   ;; point in r1.
    `((lookup-integer utf-subst-table-for-decode r1 r3)
      (if r7                            ; got a translation
        ((r0 = r1) (r1 = r3))
***************
*** 111,149 ****
                     (r1 -= #xe000)
                     (r2 = (((r1 / 96) + 32) << 7))
                     (r1 %= 96)
!                    (r1 += (r2 + 32))))))))))))))
  
! (define-ccl-program ccl-decode-mule-utf-16-le
    `(2                                 ; 2 bytes -> 1 to 4 bytes
!     ((loop
!       (read r3 r4)
!       (r1 = (r4 <8 r3))
!       ,utf-16-decode-ucs
!       (translate-character utf-translation-table-for-decode r0 r1)
!       (write-multibyte-character r0 r1)
!       (repeat))))
    "Decode UTF-16LE (little endian without signature bytes).
  Basic decoding is done into the charsets ascii, latin-iso8859-1 and
  mule-unicode-*.  Un-representable Unicode characters are decoded as
  U+fffd.  The result is run through the translation-table named
  `utf-translation-table-for-decode'.")
  
! (define-ccl-program ccl-decode-mule-utf-16-be
    `(2                                 ; 2 bytes -> 1 to 4 bytes
!     ((loop
!       (read r3 r4)
!       (r1 = (r3 <8 r4))
!       ,utf-16-decode-ucs
!       (translate-character utf-translation-table-for-decode r0 r1)
!       (write-multibyte-character r0 r1)
!       (repeat))))
    "Decode UTF-16BE (big endian without signature bytes).
  Basic decoding is done into the charsets ascii, latin-iso8859-1 and
  mule-unicode-*.  Un-representable Unicode characters are
  decoded as U+fffd.  The result is run through the translation-table of
  name `utf-translation-table-for-decode'.")
  
  (makunbound 'utf-16-decode-ucs)               ; done with it
  
  (eval-and-compile
  (defconst utf-16-decode-to-ucs
--- 116,201 ----
                     (r1 -= #xe000)
                     (r2 = (((r1 / 96) + 32) << 7))
                     (r1 %= 96)
!                    (r1 += (r2 + 32)))))))))))))
! 
! (defconst utf-16le-decode-loop
!   `(loop
!     (read r3 r4)
!     (r1 = (r4 <8 r3))
!     ,utf-16-decode-ucs
!     (translate-character utf-translation-table-for-decode r0 r1)
!     (write-multibyte-character r0 r1)
!     (repeat)))
! 
! (defconst utf-16be-decode-loop
!   `(loop
!     (read r3 r4)
!     (r1 = (r3 <8 r4))
!     ,@utf-16-decode-ucs
!     (translate-character utf-translation-table-for-decode r0 r1)
!     (write-multibyte-character r0 r1)
!     (repeat)))
! 
! )
  
! (define-ccl-program ccl-decode-mule-utf-16le
    `(2                                 ; 2 bytes -> 1 to 4 bytes
!     ,utf-16le-decode-loop)
    "Decode UTF-16LE (little endian without signature bytes).
  Basic decoding is done into the charsets ascii, latin-iso8859-1 and
  mule-unicode-*.  Un-representable Unicode characters are decoded as
  U+fffd.  The result is run through the translation-table named
  `utf-translation-table-for-decode'.")
  
! (define-ccl-program ccl-decode-mule-utf-16be
    `(2                                 ; 2 bytes -> 1 to 4 bytes
!     ,utf-16be-decode-loop)
    "Decode UTF-16BE (big endian without signature bytes).
  Basic decoding is done into the charsets ascii, latin-iso8859-1 and
  mule-unicode-*.  Un-representable Unicode characters are
  decoded as U+fffd.  The result is run through the translation-table of
  name `utf-translation-table-for-decode'.")
  
+ (define-ccl-program ccl-decode-mule-utf-16le-with-signature
+   `(2
+     ((read r3 r4)
+      ,utf-16le-decode-loop))
+   "Like ccl-decode-utf-16le but skip the first 2-byte BOM.")
+ 
+ (define-ccl-program ccl-decode-mule-utf-16be-with-signature
+   `(2
+     ((read r3 r4)
+      ,utf-16be-decode-loop))
+   "Like ccl-decode-utf-16be but skip the first 2-byte BOM.")
+ 
+ (define-ccl-program ccl-decode-mule-utf-16
+   `(2
+     ((read r3 r4)
+      (r1 = (r3 <8 r4))
+      (if (r1 == #xFFFE)
+        ;; R1 is a BOM for little endian.  We keep this character as
+        ;; is temporarily.  It is removed by the post-read-conversion
+        ;; function.
+        (,@utf-16-decode-ucs
+         (write-multibyte-character r0 r1)
+         ,utf-16le-decode-loop)
+        ((if (r1 == #xFEFF)
+           ;; R1 is a BOM for big endian, but we can't keep that
+           ;; character in the output because it can't be
+           ;; distinguished from a normal U+FEFF.  So, we keep
+           ;; #xFFFF instead.
+           ((r1 = #xFFFF)
+            ,@utf-16-decode-ucs)
+         ;; R1 is a normal Unicode character.
+         (,@utf-16-decode-ucs
+          (translate-character utf-translation-table-for-decode r0 r1)))
+       (write-multibyte-character r0 r1)
+       ,utf-16be-decode-loop))))
+   "Like ccl-decode-utf-16be/le but check the first BOM.")
+ 
  (makunbound 'utf-16-decode-ucs)               ; done with it
+ (makunbound 'utf-16le-decode-loop)
+ (makunbound 'utf-16be-decode-loop)
  
  (eval-and-compile
  (defconst utf-16-decode-to-ucs
***************
*** 168,186 ****
                  (r0 = (r3 + #x2500))
                (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                    (r0 = (r3 + #xe000))
!                 (r0 = #xfffd)))))))))))
  
! (define-ccl-program ccl-encode-mule-utf-16-le
    `(1
!     ((loop
!       (read-multibyte-character r0 r1)
!       (lookup-character utf-subst-table-for-encode r0 r1)
!       (if (r7 == 0)
!         ((translate-character utf-translation-table-for-encode r0 r1)
!          ,utf-16-decode-to-ucs))
!       (write (r0 & 255))
!       (write (r0 >> 8))
!       (repeat))))
    "Encode to UTF-16LE (little endian without signature).
  Characters from the charsets ascii, eight-bit-control,
  eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
--- 220,254 ----
                  (r0 = (r3 + #x2500))
                (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                    (r0 = (r3 + #xe000))
!                 (r0 = #xfffd))))))))))
! 
! (defconst utf-16le-encode-loop
!   `(loop
!     (read-multibyte-character r0 r1)
!     (lookup-character utf-subst-table-for-encode r0 r1)
!     (if (r7 == 0)
!       ((translate-character utf-translation-table-for-encode r0 r1)
!        ,utf-16-decode-to-ucs))
!     (write (r0 & 255))
!     (write (r0 >> 8))
!     (repeat)))
! 
! (defconst utf-16be-encode-loop
!   `(loop
!     (read-multibyte-character r0 r1)
!     (lookup-character utf-subst-table-for-encode r0 r1)
!     (if (r7 == 0)
!       ((translate-character utf-translation-table-for-encode r0 r1)
!        ,utf-16-decode-to-ucs))
!     (write (r0 >> 8))
!     (write (r0 & 255))
!     (repeat)))
! )
  
! 
! (define-ccl-program ccl-encode-mule-utf-16le
    `(1
!     ,utf-16le-encode-loop)
    "Encode to UTF-16LE (little endian without signature).
  Characters from the charsets ascii, eight-bit-control,
  eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
***************
*** 188,204 ****
  `utf-translation-table-for-encode'.
  Others are encoded as U+FFFD.")
  
! (define-ccl-program ccl-encode-mule-utf-16-be
    `(1
!     ((loop
!       (read-multibyte-character r0 r1)
!       (lookup-character utf-subst-table-for-encode r0 r1)
!       (if (r7 == 0)
!         ((translate-character utf-translation-table-for-encode r0 r1)
!          ,utf-16-decode-to-ucs))
!       (write (r0 >> 8))
!       (write (r0 & 255))
!       (repeat))))
    "Encode to UTF-16BE (big endian without signature).
  Characters from the charsets ascii, eight-bit-control,
  eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
--- 256,264 ----
  `utf-translation-table-for-encode'.
  Others are encoded as U+FFFD.")
  
! (define-ccl-program ccl-encode-mule-utf-16be
    `(1
!     ,utf-16be-encode-loop)
    "Encode to UTF-16BE (big endian without signature).
  Characters from the charsets ascii, eight-bit-control,
  eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
***************
*** 206,212 ****
--- 266,319 ----
  `utf-translation-table-for-encode'.
  Others are encoded as U+FFFD.")
  
+ (define-ccl-program ccl-encode-mule-utf-16le-with-signature
+   `(1
+     ((write #xFF)
+      (write #xFE)
+      ,utf-16le-encode-loop))
+   "Encode to UTF-16 (little endian with signature).
+ Characters from the charsets ascii, eight-bit-control,
+ eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
+ after translation through the translation-table of name
+ `utf-translation-table-for-encode'.
+ Others are encoded as U+FFFD.")
+ 
+ (define-ccl-program ccl-encode-mule-utf-16be-with-signature
+   `(1
+     ((write #xFE)
+      (write #xFF)
+      ,utf-16be-encode-loop))
+   "Encode to UTF-16 (big endian with signature).
+ Characters from the charsets ascii, eight-bit-control,
+ eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
+ after translation through the translation-table named
+ `utf-translation-table-for-encode'.
+ Others are encoded as U+FFFD.")
+ 
  (makunbound 'utf-16-decode-to-ucs)
+ (makunbound 'utf-16le-encode-loop)
+ (makunbound 'utf-16be-encode-loop)
+ 
+ (defun mule-utf-16-post-read-conversion (length)
+   (when (> length 0)
+     (let ((char (following-char)))
+       (cond ((= char (decode-char 'ucs #xFFFE))
+            (delete-char 1)
+            (setq last-coding-system-used
+                  (coding-system-change-text-conversion
+                   last-coding-system-used
+                   'mule-utf-16le-with-signature))
+            (setq length (1- length)))
+           ((= char (decode-char 'ucs #xFFFF))
+            (delete-char 1)
+            (setq last-coding-system-used
+                  (coding-system-change-text-conversion
+                   last-coding-system-used
+                   'mule-utf-16be-with-signature))
+            (setq length (1- length)))
+           (t
+            (setq last-coding-system-used 'mule-utf-16be)))))
+   length)
  
  (let ((doc "
  
***************
*** 224,236 ****
  any of the character sets listed above are encoded into the byte
  sequence representing U+FFFD (REPLACEMENT CHARACTER)."))
    (make-coding-system
!    'mule-utf-16-le 4
     ?u       ; Mule-UCS uses ?U, but code-pages uses that for koi8-u.
     (concat
!     "Little endian UTF-16 encoding for Emacs-supported Unicode characters."
      doc)
  
!    '(ccl-decode-mule-utf-16-le . ccl-encode-mule-utf-16-le)
     '((safe-charsets
        ascii
        eight-bit-control
--- 331,343 ----
  any of the character sets listed above are encoded into the byte
  sequence representing U+FFFD (REPLACEMENT CHARACTER)."))
    (make-coding-system
!    'mule-utf-16le 4
     ?u       ; Mule-UCS uses ?U, but code-pages uses that for koi8-u.
     (concat
!     "UTF-16LE encoding for Emacs-supported Unicode characters."
      doc)
  
!    '(ccl-decode-mule-utf-16le . ccl-encode-mule-utf-16le)
     '((safe-charsets
        ascii
        eight-bit-control
***************
*** 239,245 ****
        mule-unicode-2500-33ff
        mule-unicode-e000-ffff)
       (mime-charset . utf-16le)
!      (coding-category . coding-category-utf-16-le)
       (valid-codes (0 . 255))
       (dependency unify-8859-on-encoding-mode
                 unify-8859-on-decoding-mode
--- 346,352 ----
        mule-unicode-2500-33ff
        mule-unicode-e000-ffff)
       (mime-charset . utf-16le)
!      (mime-text-unsuitable . t)
       (valid-codes (0 . 255))
       (dependency unify-8859-on-encoding-mode
                 unify-8859-on-decoding-mode
***************
*** 247,258 ****
                 utf-translate-cjk)))
  
    (make-coding-system
!    'mule-utf-16-be 4 ?u
     (concat
!     "Big endian UTF-16 encoding for Emacs-supported Unicode characters."
      doc)
  
!    '(ccl-decode-mule-utf-16-be . ccl-encode-mule-utf-16-be)
     '((safe-charsets
        ascii
        eight-bit-control
--- 354,365 ----
                 utf-translate-cjk)))
  
    (make-coding-system
!    'mule-utf-16be 4 ?u
     (concat
!     "UTF-16BE encoding for Emacs-supported Unicode characters."
      doc)
  
!    '(ccl-decode-mule-utf-16be . ccl-encode-mule-utf-16be)
     '((safe-charsets
        ascii
        eight-bit-control
***************
*** 261,274 ****
        mule-unicode-2500-33ff
        mule-unicode-e000-ffff)
       (mime-charset . utf-16be)
       (coding-category . coding-category-utf-16-be)
       (valid-codes (0 . 255))
       (dependency unify-8859-on-encoding-mode
                 unify-8859-on-decoding-mode
                 utf-fragment-on-decoding
!                utf-translate-cjk))))
  
! (define-coding-system-alias 'utf-16-le 'mule-utf-16-le)
! (define-coding-system-alias 'utf-16-be 'mule-utf-16-be)
  
  ;;; utf-16.el ends here
--- 368,464 ----
        mule-unicode-2500-33ff
        mule-unicode-e000-ffff)
       (mime-charset . utf-16be)
+      (valid-codes (0 . 255))
+      (dependency unify-8859-on-encoding-mode
+                unify-8859-on-decoding-mode
+                utf-fragment-on-decoding
+                utf-translate-cjk)))
+ 
+   (make-coding-system
+    'mule-utf-16le-with-signature 4 ?u
+    (concat
+     "Little endian UTF-16 (with BOM) for Emacs-supported Unicode characters."
+     doc)
+ 
+    '(ccl-decode-mule-utf-16le-with-signature
+      . ccl-encode-mule-utf-16le-with-signature)
+    '((safe-charsets
+       ascii
+       eight-bit-control
+       latin-iso8859-1
+       mule-unicode-0100-24ff
+       mule-unicode-2500-33ff
+       mule-unicode-e000-ffff)
+      (coding-category . coding-category-utf-16-le)
+      (mime-charset . utf-16)
+      (mime-text-unsuitable . t)
+      (valid-codes (0 . 255))
+      (dependency unify-8859-on-encoding-mode
+                unify-8859-on-decoding-mode
+                utf-fragment-on-decoding
+                utf-translate-cjk)))
+ 
+   (make-coding-system
+    'mule-utf-16be-with-signature 4 ?u
+    (concat
+     "Big endian UTF-16 (with BOM) for Emacs-supported Unicode characters."
+     doc)
+ 
+    '(ccl-decode-mule-utf-16be-with-signature
+      . ccl-encode-mule-utf-16be-with-signature)
+    '((safe-charsets
+       ascii
+       eight-bit-control
+       latin-iso8859-1
+       mule-unicode-0100-24ff
+       mule-unicode-2500-33ff
+       mule-unicode-e000-ffff)
       (coding-category . coding-category-utf-16-be)
+      (mime-charset . utf-16)
       (valid-codes (0 . 255))
       (dependency unify-8859-on-encoding-mode
                 unify-8859-on-decoding-mode
                 utf-fragment-on-decoding
!                utf-translate-cjk)))
  
!   (make-coding-system
!    'mule-utf-16 4 ?u
!    (concat
!     "UTF-16 (with or without BOM) for Emacs-supported Unicode characters."
!     doc)
! 
!    '(ccl-decode-mule-utf-16 . ccl-encode-mule-utf-16be-with-signature)
!    '((safe-charsets
!       ascii
!       eight-bit-control
!       latin-iso8859-1
!       mule-unicode-0100-24ff
!       mule-unicode-2500-33ff
!       mule-unicode-e000-ffff)
!      (coding-category . coding-category-utf-16-be)
!      (mime-charset . utf-16)
!      (mime-text-unsuitable . t)
!      (valid-codes (0 . 255))
!      (dependency unify-8859-on-encoding-mode
!                unify-8859-on-decoding-mode
!                utf-fragment-on-decoding
!                utf-translate-cjk)
!      (post-read-conversion . mule-utf-16-post-read-conversion)))
! )
! 
! (define-coding-system-alias 'utf-16le 'mule-utf-16le)
! (define-coding-system-alias 'utf-16be 'mule-utf-16be)
! (define-coding-system-alias 'utf-16le-with-signature
!   'mule-utf-16le-with-signature)
! (define-coding-system-alias 'utf-16be-with-signature
!   'mule-utf-16be-with-signature)
! (define-coding-system-alias 'utf-16 'mule-utf-16)
! 
! ;; For backward compatibility.
! (define-coding-system-alias 'mule-utf-16-le 'mule-utf-16le-with-signature)
! (define-coding-system-alias 'utf-16-le 'mule-utf-16le-with-signature)
! (define-coding-system-alias 'mule-utf-16-be 'mule-utf-16be-with-signature)
! (define-coding-system-alias 'utf-16-be 'mule-utf-16be-with-signature)
  
+ ;;; arch-tag: 85455d46-d9c9-466d-a6f3-c3582a7367c4
  ;;; utf-16.el ends here




reply via email to

[Prev in Thread] Current Thread [Next in Thread]