bug-gnu-emacs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gnu.emacs.bug] Chinese GBK encoding support requested


From: Gerd Moellmann
Subject: [gnu.emacs.bug] Chinese GBK encoding support requested
Date: 29 Oct 2001 14:03:22 +0100
User-agent: Gnus/5.09 (Gnus v5.9.0) Emacs/21.1.50

Kenichi, could you please handle this one?

--- Begin Message --- Subject: Chinese GBK encoding support requested Date: 28 Oct 2001 23:46:46 +0800 User-agent: Gnus/5.09 (Gnus v5.9.0) Emacs/21.1
Hi,

Is there any plan to support Chinese GBK encoding?  This coding is
currently used widely on Simplified Chinese platforms and is the
de facto encoding scheme in use (at least until Unicode-based codings
finally gain popularity).  GBK covers a larger charset than
GB2312, including the name of the Chinese Prime Minister, Zhu Rong-Ji,
which cannot be encoded in GB2312.

Unfortunately, Emacs currently only supports GB2312, BIG5, and some
other (rarely used?) CNS codings, so it is inconvenient for
Chinese text processing.

I have made a 'dirty' patch to deal with GBK encoding, which I think
might be useful if the developers decide to add the support.  It's
'dirty' because it disables a charset, chinese-cns11643-7, to define
my own ones.  In fact, since GBK is a superset of GB2312, I can reuse
chinese-gb2312, but I need to define two more charsets and there is
only one unused charset left.

After that, I implement the encoder and decoder functions:
ccl-decode-gbk-char, ccl-encode-gbk-char, and ccl-encode-gbk-font.
So the key components are there.

The patch works fine on my system, though I'm not very sure about the
use of charset category, etc.  Also, I think it'd be better if we can
make it without disabling any existing charset.  A MULE guru may be
able to help and find out what's being left out or incorrect in my
patch.

Any comments are welcome.

Yong LU



Part I: Patch to disable chinese-cns11643-7

---------8<---------8<---------o--------->8--------->8--------->8------
diff -subr emacs-21.0.104/lisp/gnus/mm-util.el 
emacs-21.0.104.gbk/lisp/gnus/mm-util.el
--- emacs-21.0.104/lisp/gnus/mm-util.el Tue Mar  6 18:32:07 2001
+++ emacs-21.0.104.gbk/lisp/gnus/mm-util.el     Wed Aug 22 14:58:28 2001
@@ -74,8 +74,8 @@
                    korean-ksc5601 japanese-jisx0212
                    chinese-cns11643-1 chinese-cns11643-2
                    chinese-cns11643-3 chinese-cns11643-4
-                   chinese-cns11643-5 chinese-cns11643-6
-                   chinese-cns11643-7)
+                   chinese-cns11643-5 chinese-cns11643-6)
+;;                 chinese-cns11643-7)
     ;; utf-8 comes either from Mule-UCS or Mule 5+.
     ,@(if (mm-coding-system-p 'utf-8)
          (list (cons 'utf-8 (delete 'ascii
diff -subr emacs-21.0.104/lisp/international/characters.el 
emacs-21.0.104.gbk/lisp/international/characters.el
--- emacs-21.0.104/lisp/international/characters.el     Fri Mar  9 18:23:38 2001
+++ emacs-21.0.104.gbk/lisp/international/characters.el Wed Aug 22 15:00:16 2001
@@ -167,8 +167,8 @@
                  chinese-cns11643-3
                  chinese-cns11643-4
                  chinese-cns11643-5
-                 chinese-cns11643-6
-                 chinese-cns11643-7))
+                 chinese-cns11643-6))
+;;               chinese-cns11643-7))
       generic-char)
   (while cns-list
     (setq generic-char (make-char (car cns-list)))
@@ -802,7 +802,7 @@
           (chinese-cns11643-4  . iso-2022-cn)
           (chinese-cns11643-5  . iso-2022-cn)
           (chinese-cns11643-6  . iso-2022-cn)
-          (chinese-cns11643-7  . iso-2022-cn)
+;;        (chinese-cns11643-7  . iso-2022-cn)
           (indian-2-column     . devanagari)
           (tibetan             . tibetan)
           (latin-iso8859-14    . iso-latin-8)
diff -subr emacs-21.0.104/lisp/international/fontset.el 
emacs-21.0.104.gbk/lisp/international/fontset.el
--- emacs-21.0.104/lisp/international/fontset.el        Mon Feb 26 16:59:42 2001
+++ emacs-21.0.104.gbk/lisp/international/fontset.el    Wed Aug 22 15:00:10 2001
@@ -60,7 +60,7 @@
           (chinese-cns11643-4 . ("*" . "CNS11643.1992-4"))
           (chinese-cns11643-5 . ("*" . "CNS11643.1992-5"))
           (chinese-cns11643-6 . ("*" . "CNS11643.1992-6"))
-          (chinese-cns11643-7 . ("*" . "CNS11643.1992-7"))
+;;        (chinese-cns11643-7 . ("*" . "CNS11643.1992-7"))
           (chinese-big5-1 . ("*" . "Big5"))
           (chinese-big5-2 . ("*" . "Big5"))
           (chinese-sisheng . (nil . "sisheng_cwnn"))
@@ -514,8 +514,8 @@
        chinese-cns11643-3:-*-medium-r-normal-*-16-*-cns11643*-3,
        chinese-cns11643-4:-*-medium-r-normal-*-16-*-cns11643*-4,
        chinese-cns11643-5:-*-medium-r-normal-*-16-*-cns11643*-5,
-       chinese-cns11643-6:-*-medium-r-normal-*-16-*-cns11643*-6,
-       chinese-cns11643-7:-*-medium-r-normal-*-16-*-cns11643*-7")
+       chinese-cns11643-6:-*-medium-r-normal-*-16-*-cns11643*-6,")
+;;     chinese-cns11643-7:-*-medium-r-normal-*-16-*-cns11643*-7")
   "String of fontset spec of the standard fontset.
 You have the biggest chance to display international characters
 with correct glyphs by using the standard fontset.
diff -subr emacs-21.0.104/lisp/international/mule-conf.el 
emacs-21.0.104.gbk/lisp/international/mule-conf.el
--- emacs-21.0.104/lisp/international/mule-conf.el      Fri Mar  9 18:23:38 2001
+++ emacs-21.0.104.gbk/lisp/international/mule-conf.el  Wed Aug 22 15:01:12 2001
@@ -236,9 +236,9 @@
 (define-charset 249 'chinese-cns11643-6
   [2 94 2 0 ?L 0 "CNS11643-6" "CNS11643-6 (Chinese traditional): ISO-IR-186"
      "CNS11643 Plane 6 Chinese Traditional: ISO-IR-186"])
-(define-charset 250 'chinese-cns11643-7
-  [2 94 2 0 ?M 0 "CNS11643-7" "CNS11643-7 (Chinese traditional): ISO-IR-187"
-     "CNS11643 Plane 7 Chinese Traditional: ISO-IR-187"])
+;; (define-charset 250 'chinese-cns11643-7
+;;   [2 94 2 0 ?M 0 "CNS11643-7" "CNS11643-7 (Chinese traditional): ISO-IR-187"
+;;      "CNS11643 Plane 7 Chinese Traditional: ISO-IR-187"])
 
 ;; Actual Glyph for 2-column width.
 (define-charset 251 'indian-2-column
@@ -373,13 +373,13 @@
    (nil korean-ksc5601 chinese-gb2312 chinese-cns11643-1 t)
    (nil chinese-cns11643-2)
    (nil chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5
-       chinese-cns11643-6 chinese-cns11643-7)
+       chinese-cns11643-6) ;; chinese-cns11643-7)
    short ascii-eol ascii-cntl seven locking-shift single-shift nil nil nil
    init-bol)
  '((safe-charsets ascii japanese-jisx0208 japanese-jisx0208-1978 latin-jisx0201
                  korean-ksc5601 chinese-gb2312 chinese-cns11643-1
                  chinese-cns11643-2 chinese-cns11643-3 chinese-cns11643-4
-                 chinese-cns11643-5 chinese-cns11643-6 chinese-cns11643-7)
+                 chinese-cns11643-5 chinese-cns11643-6) ;; chinese-cns11643-7)
    (composition . t)))
 
 (define-coding-system-alias 'iso-2022-cjk 'iso-2022-7bit-lock-ss2)
diff -subr emacs-21.0.104/lisp/language/chinese.el 
emacs-21.0.104.gbk/lisp/language/chinese.el
--- emacs-21.0.104/lisp/language/chinese.el     Wed Jan 24 22:50:08 2001
+++ emacs-21.0.104.gbk/lisp/language/chinese.el Wed Aug 22 14:57:29 2001
@@ -54,12 +54,12 @@
    (nil chinese-gb2312 chinese-cns11643-1)
    (nil chinese-cns11643-2)
    (nil chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5
-       chinese-cns11643-6 chinese-cns11643-7)
+       chinese-cns11643-6) ;; chinese-cns11643-7)
    nil ascii-eol ascii-cntl seven locking-shift single-shift nil nil nil
    init-bol)
  '((safe-charsets ascii chinese-gb2312 chinese-cns11643-1 chinese-cns11643-2
                  chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5
-                 chinese-cns11643-6 chinese-cns11643-7)
+                 chinese-cns11643-6) ;; chinese-cns11643-7)
    (mime-charset . iso-2022-cn-ext)))
 

@@ -171,8 +171,8 @@
 (set-language-info-alist
  "Chinese-CNS" '((charset chinese-cns11643-1 chinese-cns11643-2
                          chinese-cns11643-3 chinese-cns11643-4
-                         chinese-cns11643-5 chinese-cns11643-6
-                         chinese-cns11643-7)
+                         chinese-cns11643-5 chinese-cns11643-6)
+;;                       chinese-cns11643-7)
                 (coding-system iso-2022-cn)
                 (coding-priority iso-2022-cn chinese-big5 chinese-iso-8bit)
                 (features china-util)
diff -subr emacs-21.0.104/lisp/ps-mule.el emacs-21.0.104.gbk/lisp/ps-mule.el
--- emacs-21.0.104/lisp/ps-mule.el      Thu Apr  5 17:45:08 2001
+++ emacs-21.0.104.gbk/lisp/ps-mule.el  Wed Aug 22 14:56:40 2001
@@ -411,8 +411,8 @@
      (normal bdf ("cns5-40.bdf" "cns-5-40.bdf") ps-mule-encode-7bit 2))
     (chinese-cns11643-6
      (normal bdf ("cns6-40.bdf" "cns-6-40.bdf") ps-mule-encode-7bit 2))
-    (chinese-cns11643-7
-     (normal bdf ("cns7-40.bdf" "cns-7-40.bdf") ps-mule-encode-7bit 2))
+;;     (chinese-cns11643-7
+;;      (normal bdf ("cns7-40.bdf" "cns-7-40.bdf") ps-mule-encode-7bit 2))
     (indian-2-column
     (normal bdf ("ind24-mule.bdf" "mule-indian-24.bdf") ps-mule-encode-7bit 2))
     (tibetan
---------8<---------8<---------o--------->8--------->8--------->8------


Part II: GBK encoders and decoders
---------8<---------8<---------o--------->8--------->8--------->8------
;;; chinese-gbk.el --- Support for Chinese GBK
;;
;; Author: Yong Lu <l y o n g u @ y a h o o . c o m>
;;
;; Date: Aug 22, 2001
;;

;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License along with
;; CCE; see the file COPYING.  If not, write to the Free Software Foundation, 
;; 675 Mass Ave, Cambridge, MA 02139, USA.

;;; Commentary:

;; GBK includes the character set GB2312

;; Please refer to mule-conf.el and other source files in
;; ./lisp/{language,international} and ./src for more information.
;;

;;
;; define gbk specific charsets.
;;

;; ;;
;; ;; chinese-gbk-0:
;; ;;    Range            Words          Marks
;; ;;    -------------------------------------------------------
;; ;;    A1A1-A9FE        846            GB2312, GB12345 (GBK/1)
;; ;;    AAA1-AFFE        564            User defined 1
;; ;;    B0A1-F7FE        6768           GB2312 (GBK/2)
;; ;;    F8A1-FEFE        658            User defined 2
;; ;;
;; (define-charset 249 'chinese-gbk-0
;;   [2 94 2 0 ?0 0 "GBK (Level-1)" "GBK (Level-1) A1A1-F7FE"
;;      "GB2312, GB12345, Big5 and Symbols Part (A1A1-F7FE) of GBK (Chinese Simplified)"])

;;
;; chinese-gbk-1:
;;    Range            Words          Marks
;;    ------------------------------------------------------
;;    8140-A0FE        6080           GB13000 (GBK/3)
;;
;; Charset for the 8140-A0FE region of GBK.  It takes charset id 250.
;; The vector describes a 2-dimensional, 94-chars-per-dimension,
;; 2-column-wide charset; the three strings are its short name, long
;; name and description.
;; NOTE(review): final char ?1 for a 94x94 charset may collide with a
;; private final char already used by another charset -- confirm
;; against mule-conf.el.
(define-charset 250 'chinese-gbk-1
  [2 94 2 0 ?1 0 "GBK (Level-2)" "GBK (Level-2) 8140-A0FE"
     "GB13000 Part 1 (8140-A0FE) of GBK (Chinese Simplified)"])

;;
;; chinese-gbk-2:
;;    Range            Words          Marks
;;    -----------------------------------------------------
;;
;;    A140-A7A0        672            User defined 3
;;    A840-A940        192            Big5, Symbols (GBK/5)
;;    AA40-FEA0        8160           GB13000 (GBK/4)
;;
;; Each row of this region has 96 trail bytes (40-7E and 80-A0), hence
;; a 96-chars-per-dimension charset.
(define-charset 253 'chinese-gbk-2
  [2 96 2 0 ?2 0
   "GBK (Level-3)"
   "GBK (Level-3) A140-FEA0"
   "GB13000 Part 2 (A140-FEA0) of GBK (Chinese Simplified)"])

;;
;; Include user-define regions???
;;

;;
;; Modify charset categories.  Improve it later.
;;
;; Give every character of the two GBK-specific charsets word syntax
;; ("w") and put it in the categories ?c, ?C and ?|.  The settings are
;; applied through each charset's generic character.  chinese-gbk-0 is
;; not handled here because that charset is not defined.
(let ((gbk-generic-chars (list (make-char 'chinese-gbk-1)
                               (make-char 'chinese-gbk-2)))
      generic-char)
  (while gbk-generic-chars
    (setq generic-char (car gbk-generic-chars)
          gbk-generic-chars (cdr gbk-generic-chars))
    (modify-syntax-entry generic-char "w")
    (modify-category-entry generic-char ?c)
    (modify-category-entry generic-char ?C)
    (modify-category-entry generic-char ?\|)))


;;
;; Coding category.  Any use?
;;
;;(setq coding-category-gbk 'chinese-gbk)


;; (cons (gbk
;;        (ascii chinese-gbk-0 chinese-gbk-1 chinese-gbk-2)
;;        ccl-decode-gbk-char
;;        ((32 127)
;;      ((?\x81 ?\xFE) . (?\x40 ?\x7E ?\x80 ?\xFE))))
;;       non-iso-charset-alist)


;;=============================;;
;;
;; ccl coder/decoder for gbk
;;
;;=============================;;
;; Decoder: GBK byte stream -> Emacs characters.
;;
;; Register usage:
;;   r0  current (lead) byte, later the 14-bit position code to write
;;   r1  trail byte
;;   r2  linear index inside the chinese-gbk-1 region
;;   r3  charset id of eight-bit-control (for invalid bytes #x80-#x9f)
;;   r4  charset id of eight-bit-graphic (for invalid bytes #xa0-#xff)
;;   r5  charset id of the character being written
;;
;; The outer loop reads one byte per iteration.  The inner loop exists
;; so that, after an invalid 2-byte sequence, the would-be trail byte
;; can be re-examined as a new lead byte via (repeat) without reading
;; again.
;;
;; Mapping of valid 2-byte sequences:
;;   lead 81-A0, trail 40-7E/80-FE  -> chinese-gbk-1 (190 codes per lead
;;                                     byte, re-packed into 94x94)
;;   lead A1-FE, trail 40-7E/80-A0  -> chinese-gbk-2 (96 codes per row)
;;   lead A1-FE, trail A1-FE        -> chinese-gb2312
;; Bytes #x80 and #xff can never start a sequence and are emitted
;; as single eight-bit characters.
(define-ccl-program ccl-decode-gbk-char
  `(2
    ((r3 = ,(charset-id 'eight-bit-control))
     (r4 = ,(charset-id 'eight-bit-graphic))
     (loop
      (read r0)

      (loop
      ;; 1 byte encoding, i.e., ascii
       (if (r0 < #x80)
           ((write r0)
            (break))

         (if (r0 == #x80)
             ((write-multibyte-character r3 r0)
              (break))
           ;; #xff is not a valid lead byte either; treating it as one
           ;; would yield a row number outside the charset.
           (if (r0 == #xff)
               ((write-multibyte-character r4 r0)
                (break))
             (r0 = r0))))

       ;; maybe 2-byte sequence
       (read r1)
       (if (r0 < #xa1)
           (if (r1 < #x40)
               ;; invalid 2-byte sequence
               ((if (r0 < #xa0)
                    (write-multibyte-character r3 r0)
                  (write-multibyte-character r4 r0))
                (r0 = r1)
                (repeat))
             (if (r1 == #x7f)
                 ;; invalid 2-byte sequence
                 ((if (r0 < #xa0)
                      (write-multibyte-character r3 r0)
                    (write-multibyte-character r4 r0))
                  (r0 = r1)
                  (repeat))
               (if (r1 == #xff)
                   ;; invalid 2-byte sequence
                   ((if (r0 < #xa0)
                        (write-multibyte-character r3 r0)
                      (write-multibyte-character r4 r0))
                    (r0 = r1)
                    (repeat))
                 ;; chinese-gbk-1
                 ;; Trail bytes 40-7E map to 0-62 and 80-FE to 63-189
                 ;; (the gap at 7F is closed).
                 ((if (r1 < #x80)
                      (r1 -= #x40)
                    (r1 -= #x41))
                  (r2 = (((r0 - #x81) * 190) + r1))
                  (r0 = (((r2 / 94) + 33) << 7))
                  (r0 += ((r2 % 94) + 33))
                  (r5 = ,(charset-id 'chinese-gbk-1))
                  (write-multibyte-character r5 r0)
                  (break)))))

         ;; r0 >= #xa1
         ((if (r1 < #x40)
              ;; invalid 2-byte sequence
              ((write-multibyte-character r4 r0)
               (r0 = r1)
               (repeat))

            (if (r1 == #x7f)
                ;; invalid 2-byte sequence
                ((write-multibyte-character r4 r0)
                 (r0 = r1)
                 (repeat))

              (if (r1 == #xff)
                  ;; invalid 2-byte sequence
                  ((write-multibyte-character r4 r0)
                   (r0 = r1)
                   (repeat))

                (if (r1 <= #xa0)
                    ;; chinese-gbk-2
                    ;; Close the gap at 7F so trail bytes become 0-95.
                    ((if (r1 >= #x80)
                         (r1 -= 1)
                       (r1 = r1))
                     (r0 = (((r0 - #xa0) + 32) << 7))
                     (r0 += ((r1 - #x40) + 32))
                     (r5 = ,(charset-id 'chinese-gbk-2))
                     (write-multibyte-character r5 r0)
                     (break))

                  ;; chinese-gb2312
                  ((r0 = (((r0 - #xa0) + 32) << 7))
                   (r0 += ((r1 - #xa0) + 32))
                   (r5 = ,(charset-id 'chinese-gb2312))
                   (write-multibyte-character r5 r0)
                   (break)))))))))
      (repeat))))
  "CCL GBK decoder.")


;; Encoder: Emacs characters -> GBK byte stream.
;;
;; Each iteration reads one character (r0 = charset id, r1 = code).
;;   - ascii, eight-bit-control and eight-bit-graphic characters are
;;     written as the single byte in r1;
;;   - chinese-gb2312, chinese-gbk-1 and chinese-gbk-2 characters are
;;     written as two bytes (r1 = lead, r2 = trail);
;;   - characters of any other charset are skipped: nothing is written.
;; For 2-byte charsets the code in r1 holds the first position code in
;; the bits above bit 6 and the second one in the low 7 bits.
(define-ccl-program ccl-encode-gbk-char
  `(1
    (loop
     (read-multibyte-character r0 r1)
     (if (r0 == ,(charset-id 'ascii))
         ((write r1)
          (repeat))
       (if (r0 == ,(charset-id 'eight-bit-control))
           ((write r1)
            (repeat))
         (if (r0 == ,(charset-id 'eight-bit-graphic))
             ((write r1)
              (repeat))
           (r1 = r1))))

     ;; chinese-gb2312
     (if (r0 == ,(charset-id 'chinese-gb2312))
         ((r2 = (((r1 & #x7f) - 32) + #xa0))
          (r1 = (((r1 >> 7) - 32) + #xa0))
          (write r1 r2))

       ;; chinese-gbk-1
       ;; Undo the 94x94 packing: rebuild the linear index (r3), then
       ;; split it into 190 codes per lead byte.
       (if (r0 == ,(charset-id 'chinese-gbk-1))
           ((r2 = ((r1 & #x7f) - 33))
            (r1 = ((r1 >> 7) - 33))
            (r3 = ((r1 * 94) + r2))
            (r1 = ((r3 / 190) + #x81))
            (r2 = (r3 % 190))
            ;; Indices 0-62 become trail bytes 40-7E, 63-189 become
            ;; 80-FE (7F is skipped).
            (if (r2 >= #x3f)
                (r2 += #x41)
              (r2 += #x40))
            (write r1 r2))

         ;; chinese-gbk-2
         (if (r0 == ,(charset-id 'chinese-gbk-2))
             ((r2 = (((r1 & #x7f) - 32) + #x40))
              (r1 = (((r1 >> 7) - 32) + #xa0))
              ;; Skip trail byte 7F.
              (if (r2 >= #x7f)
                  (r2 += 1)
                (r2 += 0))
              (write r1 r2))
           (repeat))))
     (repeat)))
  "CCL GBK encoder.")

;; Font encoder: turns the two position codes of a character into the
;; two bytes used to index a GBK font.
;;
;;   In:  r0 = charset id (chinese-gb2312, chinese-gbk-1; any other id
;;             is handled as chinese-gbk-2)
;;        r1 = position code 1
;;        r2 = position code 2
;;   Out: r1 = font code point 1
;;        r2 = font code point 2
;;   r3 is used as scratch for chinese-gbk-1.
(define-ccl-program ccl-encode-gbk-font
  `(0
    (if (r0 == ,(charset-id 'chinese-gb2312))
        ;; chinese-gb2312: positions 33-126 -> bytes A1-FE.
        ((r1 += #x80)
         (r2 += #x80))
      (if (r0 == ,(charset-id 'chinese-gbk-1))
          ;; chinese-gbk-1: rebuild the linear index, then split it
          ;; into 190 trail bytes per lead byte.
          ((r1 -= 33)
           (r2 -= 33)
           (r3 = ((r1 * 94) + r2))
           (r1 = ((r3 / 190) + #x81))
           (r2 = ((r3 % 190) + #x40))
           ;; Trail byte 7F does not exist; shift 7F and above up by one.
           (if (r2 >= #x7f)
               (r2 += 1)
             (r2 = r2)))
        ;; chinese-gbk-2: rows 33-126 -> A1-FE, columns 32-127 ->
        ;; 40-7E/80-A0.
        ((r1 += #x80)
         (r2 += 32)
         (if (r2 >= #x7f)
             (r2 += 1)
           (r2 = r2))))))
  "CCL program to encode a GBK code to code point of GBK font.")


;; Prepend an entry pairing the string "gbk" with the value of
;; `ccl-encode-gbk-font' to `font-ccl-encoder-alist'.
(setq font-ccl-encoder-alist
      (cons `("gbk" . ,ccl-encode-gbk-font)
            font-ccl-encoder-alist))


;;
;; Make GBK coding system
;; mime_charset?
;;
;; Type 4 means the coding system is implemented by a pair of CCL
;; programs (decoder . encoder); ?Z is its mnemonic character.
;;
;; `safe-charsets' lists only the charsets that `ccl-encode-gbk-char'
;; really writes out.  chinese-big5-1 and chinese-big5-2 are
;; deliberately NOT listed: the encoder emits nothing for characters
;; of those charsets, so declaring them safe would let such text be
;; dropped silently on save.
(make-coding-system
 'chinese-gbk 4 ?Z "GBK 8-bit encoding for Chinese"
 '(ccl-decode-gbk-char . ccl-encode-gbk-char)
 '((safe-charsets ascii chinese-gb2312 chinese-gbk-1 chinese-gbk-2)
   (valid-codes (0 . 255))))
;   (charset-origin-alist (chinese-gbk-0 "GBK" ccl-encode-gbk-char)
;                        (chinese-gbk-1 "GBK" ccl-encode-gbk-char)
;                        (chinese-gbk-2 "GBK" ccl-encode-gbk-char))))

;; Make `gbk' and `cn-gbk' aliases of `chinese-gbk', in that order,
;; then call `update-coding-systems-internal'.
(mapcar (lambda (alias)
          (define-coding-system-alias alias 'chinese-gbk))
        '(gbk cn-gbk))

(update-coding-systems-internal)

;;
;; fontset (see lisp/international/fontset.el)
;;
;; In "fontset-default", give the generic character of each charset
;; covered by GBK the font spec (nil . "GBK").
(let ((gbk-charsets '(chinese-gb2312 chinese-gbk-1 chinese-gbk-2)))
  (while gbk-charsets
    (set-fontset-font "fontset-default"
                      (make-char (car gbk-charsets))
                      (cons nil "GBK"))
    (setq gbk-charsets (cdr gbk-charsets))))

;; Both regexps are set to values that include "gbk":
;;  - `x-pixel-size-width-font-regexp': setting for suppressing
;;    XLoadQueryFont on big fonts;
;;  - `vertical-centering-font-regexp': fonts that require vertical
;;    centering.
(setq x-pixel-size-width-font-regexp
      "gbk\\|gb2312\\|jisx0208\\|ksc5601\\|cns11643\\|big5"
      vertical-centering-font-regexp
      "gbk\\|gb2312\\|jisx0208\\|jisx0212\\|ksc5601\\|cns11643\\|big5")

;;
;; kbd handler for gbk
;;
(defun encoded-kbd-self-insert-gbk ()
  "Handle a 2-byte GBK sequence typed on an encoded keyboard.
The first byte is taken from `last-command-char' and the second one is
read with `read-char-exclusive'.  The two bytes are decoded with the
`chinese-gbk' coding system and the resulting character(s) are put at
the front of `unread-command-events'."
  (interactive)
  ;; `ccl-decode-gbk-char' is a CCL program, not a Lisp function, so it
  ;; cannot be called directly; run it through the coding system that
  ;; uses it as decoder instead.
  (let ((bytes (make-string 2 0)))
    (aset bytes 0 last-command-char)
    (aset bytes 1 (read-char-exclusive))
    ;; An invalid pair decodes to more than one character, so splice
    ;; in every decoded character rather than assuming exactly one.
    (setq unread-command-events
          (append (decode-coding-string bytes 'chinese-gbk)
                  unread-command-events))))



;;
;; GBK language environment
;;
;; Register the "Chinese-GBK" language environment.  The alist gives
;; its charsets, coding systems, coding priority, default input
;; method, required feature, sample text and documentation string;
;; the list ("Chinese") is passed as the final argument.
(set-language-info-alist
 "Chinese-GBK"
 '((charset chinese-gb2312 chinese-gbk-1 chinese-gbk-2 chinese-sisheng
            chinese-big5-1 chinese-big5-2)
   (coding-system chinese-gbk chinese-iso-8bit chinese-big5 iso-2022-cn
                  chinese-hz)
   (coding-priority chinese-gbk chinese-iso-8bit chinese-big5 iso-2022-cn)
   (input-method . "chinese-py-punct")
   (features china-util)
   (sample-text . "Chinese GBK (中文,普通话,汉语) 你好")
   (documentation . "Support for Chinese GBK character set."))
 '("Chinese"))

;; Keep the original feature name for existing (require 'gbk) callers.
(provide 'gbk)
;; Also provide the feature matching the file name given in the header
;; line (chinese-gbk.el), so that (require 'chinese-gbk) succeeds.
(provide 'chinese-gbk)

;;; chinese-gbk.el ends here
---------8<---------8<---------o--------->8--------->8--------->8------

_______________________________________________
Bug-gnu-emacs mailing list
Bug-gnu-emacs@gnu.org
http://mail.gnu.org/mailman/listinfo/bug-gnu-emacs


--- End Message ---

reply via email to

[Prev in Thread] Current Thread [Next in Thread]