qemacs-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemacs-commit] qemacs buffer.c charset.c charsetmore.c qe.h qe...


From: Charlie Gordon
Subject: [Qemacs-commit] qemacs buffer.c charset.c charsetmore.c qe.h qe...
Date: Mon, 10 Feb 2014 20:29:27 +0000

CVSROOT:        /sources/qemacs
Module name:    qemacs
Changes by:     Charlie Gordon <chqrlie>        14/02/10 20:29:27

Modified files:
        .              : buffer.c charset.c charsetmore.c qe.h qe.c 
                         qeconfig.h 

Log message:
        Improve charset detection and handling
        
        * add charset_raw for binary files
        * improve charset detection for ambiguous cases
        * add do_show_coding_system()
        * add do_set_auto_coding() to (re)select the best coding system
        * handle BOM mark: display as \ufeff and ignore for syntax coloring

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/qemacs/buffer.c?cvsroot=qemacs&r1=1.72&r2=1.73
http://cvs.savannah.gnu.org/viewcvs/qemacs/charset.c?cvsroot=qemacs&r1=1.31&r2=1.32
http://cvs.savannah.gnu.org/viewcvs/qemacs/charsetmore.c?cvsroot=qemacs&r1=1.16&r2=1.17
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.h?cvsroot=qemacs&r1=1.139&r2=1.140
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.c?cvsroot=qemacs&r1=1.148&r2=1.149
http://cvs.savannah.gnu.org/viewcvs/qemacs/qeconfig.h?cvsroot=qemacs&r1=1.45&r2=1.46

Patches:
Index: buffer.c
===================================================================
RCS file: /sources/qemacs/qemacs/buffer.c,v
retrieving revision 1.72
retrieving revision 1.73
diff -u -b -r1.72 -r1.73
--- buffer.c    7 Feb 2014 15:56:15 -0000       1.72
+++ buffer.c    10 Feb 2014 20:29:26 -0000      1.73
@@ -498,6 +498,9 @@
 
     if (flags & BF_UTF8) {
         eb_set_charset(b, &charset_utf8, b->eol_type);
+    } else
+    if (flags & BF_RAW) {
+        eb_set_charset(b, &charset_raw, EOL_UNIX);
     } else {
         /* CG: default charset should be selectable */
         eb_set_charset(b, &charset_8859_1, b->eol_type);

Index: charset.c
===================================================================
RCS file: /sources/qemacs/qemacs/charset.c,v
retrieving revision 1.31
retrieving revision 1.32
diff -u -b -r1.31 -r1.32
--- charset.c   6 Feb 2014 00:19:39 -0000       1.31
+++ charset.c   10 Feb 2014 20:29:26 -0000      1.32
@@ -99,6 +99,37 @@
 };
 
 /********************************************************/
+/* raw */
+
+static void decode_raw_init(CharsetDecodeState *s)
+{
+    s->table = table_idem;
+}
+
+static u8 *encode_raw(__unused__ QECharset *charset, u8 *p, int c)
+{
+    if (c <= 0xff) {
+        *p++ = c;
+        return p;
+    } else {
+        return NULL;
+    }
+}
+
+QECharset charset_raw = {
+    "raw",
+    "binary|none",
+    decode_raw_init,
+    decode_8bit,
+    encode_raw,
+    charset_get_pos_8bit,
+    charset_get_chars_8bit,
+    charset_goto_char_8bit,
+    charset_goto_line_8bit,
+    1, 0, 0, 10, 0, 0, NULL, NULL,
+};
+
+/********************************************************/
 /* 8859-1 */
 
 static void decode_8859_1_init(CharsetDecodeState *s)
@@ -1072,7 +1103,7 @@
 /* detect the charset. Actually only UTF8 is detected */
 QECharset *detect_charset(const u8 *buf, int size, EOLType *eol_typep)
 {
-    int i, l, c, has_utf8;
+    int i, l, c, has_utf8, has_binary;
 
     has_utf8 = 0;
     for (i = 0; i < size;) {
@@ -1145,10 +1176,41 @@
 #endif
     /* Should detect iso-2220-jp upon \033$@ and \033$B, but jis
      * support is not selected in tiny build
+     * XXX: should use charset probe functions.
      */
-    /* CG: should use a state variable for default charset */
-    detect_eol_type_8bit(buf, size, &charset_8859_1, eol_typep);
+
+    has_binary = 0;
+    {
+        static const uint32_t magic = (1 << '\b') | (1 << '\t') | (1 << '\f') |
+                                      (1 << '\n') | (1 << '\r') | (1 << '\033') |
+                                      (1 << 0x0e) | (1 << 0x0f) | (1 << 0x1f);
+
+        for (i = 0; i < size; i++) {
+            c = buf[i];
+            if (c < 32 && !(magic & (1 << c)))
+                has_binary += 1;
+        }
+    }
+    if (has_binary) {
+        *eol_typep = EOL_UNIX;
+        return &charset_raw;
+    }
+
+    detect_eol_type_8bit(buf, size, &charset_raw, eol_typep);
+
+    if (*eol_typep == EOL_DOS) {
+        /* XXX: default DOS files to Latin1, should be selectable */
     return &charset_8859_1;
+    }
+#ifndef CONFIG_TINY
+    if (*eol_typep == EOL_MAC) {
+        /* XXX: default MAC files to Mac_roman, should be selectable */
+        /* XXX: should use probe functions */
+        return &charset_mac_roman;
+    }
+#endif
+    /* XXX: should use a state variable for default charset */
+    return &charset_utf8;
 }
 
 /********************************************************/
@@ -1347,6 +1409,7 @@
     for (i = 0xc0; i < 0xfe; i++)
         table_utf8[i] = ESCAPE_CHAR;
 
+    qe_register_charset(&charset_raw);
     qe_register_charset(&charset_8859_1);
     qe_register_charset(&charset_vt100);
     qe_register_charset(&charset_7bit);

Index: charsetmore.c
===================================================================
RCS file: /sources/qemacs/qemacs/charsetmore.c,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -b -r1.16 -r1.17
--- charsetmore.c       24 Jan 2014 01:22:23 -0000      1.16
+++ charsetmore.c       10 Feb 2014 20:29:26 -0000      1.17
@@ -1159,7 +1159,7 @@
     0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,
 };
 
-static QECharset charset_mac_roman = {
+QECharset charset_mac_roman = {
     "mac-roman",
     "x-mac|mac",
     decode_8bit_init,

Index: qe.h
===================================================================
RCS file: /sources/qemacs/qemacs/qe.h,v
retrieving revision 1.139
retrieving revision 1.140
diff -u -b -r1.139 -r1.140
--- qe.h        10 Feb 2014 20:10:32 -0000      1.139
+++ qe.h        10 Feb 2014 20:29:26 -0000      1.140
@@ -506,10 +506,14 @@
 };
 
 extern QECharset *first_charset;
-extern QECharset charset_utf8, charset_8859_1; /* predefined charsets */
+/* predefined charsets */
+extern QECharset charset_raw;
+extern QECharset charset_8859_1;
+extern QECharset charset_utf8;
 extern QECharset charset_vt100; /* used for the tty output */
 extern QECharset charset_ucs2le, charset_ucs2be;
 extern QECharset charset_ucs4le, charset_ucs4be;
+extern QECharset charset_mac_roman;
 
 typedef enum EOLType {
     EOL_UNIX = 0,
@@ -1760,6 +1764,8 @@
 void do_exchange_point_and_mark(EditState *s);
 QECharset *read_charset(EditState *s, const char *charset_str,
                         EOLType *eol_typep);
+void do_show_coding_system(EditState *s);
+void do_set_auto_coding(EditState *s, int verbose);
 void do_set_buffer_file_coding_system(EditState *s, const char *charset_str);
 void do_convert_buffer_file_coding_system(EditState *s,
     const char *charset_str);

Index: qe.c
===================================================================
RCS file: /sources/qemacs/qemacs/qe.c,v
retrieving revision 1.148
retrieving revision 1.149
diff -u -b -r1.148 -r1.149
--- qe.c        10 Feb 2014 20:10:32 -0000      1.148
+++ qe.c        10 Feb 2014 20:29:26 -0000      1.149
@@ -1875,6 +1875,31 @@
     return charset;
 }
 
+void do_show_coding_system(EditState *s)
+{
+    put_status(s, "Buffer charset is now %s%s", s->b->charset->name,
+               s->b->eol_type == EOL_DOS ? "-dos" :
+               s->b->eol_type == EOL_MAC ? "-mac" : "-unix");
+}
+
+void do_set_auto_coding(EditState *s, int verbose)
+{
+    u8 buf[4097];
+    int buf_size;
+    EditBuffer *b = s->b;
+    EOLType eol_type = b->eol_type;
+    QECharset *charset;
+
+    buf_size = eb_read(b, 0, buf, sizeof(buf));
+    eol_type = b->eol_type;
+    /* XXX: detect_charset returns a default charset */
+    charset = detect_charset(buf, buf_size, &eol_type);
+    eb_set_charset(b, charset, eol_type);
+    if (verbose) {
+        do_show_coding_system(s);
+    }
+}
+
 void do_set_buffer_file_coding_system(EditState *s, const char *charset_str)
 {
     QECharset *charset;
@@ -1885,7 +1910,7 @@
     if (!charset)
         return;
     eb_set_charset(s->b, charset, eol_type);
-    put_status(s, "Charset is now %s for this buffer", s->b->charset->name);
+    do_show_coding_system(s);
 }
 
 /* convert the charset of a buffer to another charset */
@@ -3242,7 +3267,7 @@
 int generic_get_colorized_line(EditState *s, unsigned int *buf, int buf_size,
                                int *offsetp, int line_num)
 {
-    int len, l, line, col, offset;
+    int len, l, line, col, offset, bom;
     int colorize_state;
 
     /* invalidate cache if needed */
@@ -3272,7 +3297,8 @@
 
         for (l = s->colorize_nb_valid_lines; l <= line_num; l++) {
             len = eb_get_line(s->b, buf, buf_size, &offset);
-            s->colorize_func(buf, len, &colorize_state, 1);
+            bom = (len > 0 && buf[0] == 0xFEFF);
+            s->colorize_func(buf + bom, len - bom, &colorize_state, 1);
             s->colorize_states[l] = colorize_state;
         }
     }
@@ -3280,7 +3306,8 @@
     /* compute line color */
     colorize_state = s->colorize_states[line_num];
     len = eb_get_line(s->b, buf, buf_size, offsetp);
-    s->colorize_func(buf, len, &colorize_state, 0);
+    bom = (len > 0 && buf[0] == 0xFEFF);
+    s->colorize_func(buf + bom, len - bom, &colorize_state, 0);
 
     /* XXX: if state is same as previous, minimize invalid region? */
     s->colorize_states[line_num + 1] = colorize_state;
@@ -3504,7 +3531,8 @@
                 /* currently, we cannot display these chars */
                 display_printf(ds, offset0, offset, "\\U%08x", c);
             } else
-            if (c >= 256 && s->qe_state->show_unicode == 1) {
+            if (c >= 256 && (s->qe_state->show_unicode == 1 || c == 0xfeff)) {
+                /* Display BOM as \uFEFF to make it explicit */
                 display_printf(ds, offset0, offset, "\\u%04x", c);
             } else {
                 display_char_bidir(ds, offset0, offset, embedding_level, c);

Index: qeconfig.h
===================================================================
RCS file: /sources/qemacs/qemacs/qeconfig.h,v
retrieving revision 1.45
retrieving revision 1.46
diff -u -b -r1.45 -r1.46
--- qeconfig.h  10 Feb 2014 20:10:32 -0000      1.45
+++ qeconfig.h  10 Feb 2014 20:29:26 -0000      1.46
@@ -381,6 +381,8 @@
     CMD2( KEY_NONE, KEY_NONE,
           "set-mode", do_set_mode, ESs,
           "s{Set mode: }[mode]")
+    CMD1( KEY_NONE, KEY_NONE,
+          "set-auto-coding", do_set_auto_coding, 1)
 
     /* tab & indent */
     CMD2( KEY_NONE, KEY_NONE,



reply via email to

[Prev in Thread] Current Thread [Next in Thread]