[Qemacs-commit] qemacs charset.c tests/NonBMP1.utf8 tests/TestP...

qemacs-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemacs-commit] qemacs charset.c tests/NonBMP1.utf8 tests/TestP...

From:	Charlie Gordon
Subject:	[Qemacs-commit] qemacs charset.c tests/NonBMP1.utf8 tests/TestP...
Date:	Sat, 15 Mar 2014 14:55:06 +0000
CVSROOT:        /sources/qemacs
Module name:    qemacs
Changes by:     Charlie Gordon <chqrlie>        14/03/15 14:55:06

Modified files:
        .              : charset.c 
Added files:
        tests          : NonBMP1.utf8 TestPage.ucs2be.txt 
                         TestPage.ucs2le.txt TestPage.ucs4be.txt 
                         TestPage.ucs4le.txt 

Log message:
        fix charset_detect
        
        * use charset probing functions to detect ucs2 and ucs4 charsets in files
          without a BOM (byte order mark)
        * add test files for multibyte charsets in tests directory

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/qemacs/charset.c?cvsroot=qemacs&r1=1.33&r2=1.34
http://cvs.savannah.gnu.org/viewcvs/qemacs/tests/NonBMP1.utf8?cvsroot=qemacs&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/qemacs/tests/TestPage.ucs2be.txt?cvsroot=qemacs&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/qemacs/tests/TestPage.ucs2le.txt?cvsroot=qemacs&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/qemacs/tests/TestPage.ucs4be.txt?cvsroot=qemacs&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/qemacs/tests/TestPage.ucs4le.txt?cvsroot=qemacs&rev=1.1

Patches:
Index: charset.c
===================================================================
RCS file: /sources/qemacs/qemacs/charset.c,v
retrieving revision 1.33
retrieving revision 1.34
diff -u -b -r1.33 -r1.34
--- charset.c   1 Mar 2014 22:37:26 -0000       1.33
+++ charset.c   15 Mar 2014 14:55:05 -0000      1.34
@@ -1208,7 +1208,7 @@
 }
 
 /* detect the end of line type. */
-void detect_eol_type_8bit(const u8 *buf, int size,
+static void detect_eol_type_8bit(const u8 *buf, int size,
                           QECharset *charset, EOLType *eol_typep)
 {
     const u8 *p, *p1;
@@ -1259,7 +1259,7 @@
     *eol_typep = eol_type;
 }
 
-void detect_eol_type_16bit(const u8 *buf, int size,
+static void detect_eol_type_16bit(const u8 *buf, int size,
                            QECharset *charset, EOLType *eol_typep)
 {
     const uint16_t *p, *p1;
@@ -1317,7 +1317,7 @@
     *eol_typep = eol_type;
 }
 
-void detect_eol_type_32bit(const u8 *buf, int size,
+static void detect_eol_type_32bit(const u8 *buf, int size,
                            QECharset *charset, EOLType *eol_typep)
 {
     const uint32_t *p, *p1;
@@ -1375,6 +1375,20 @@
     *eol_typep = eol_type;
 }
 
+static QECharset *detect_eol_type(const u8 *buf, int size,
+                                  QECharset *charset, EOLType *eol_typep)
+{   /* Dispatch EOL detection on charset->char_size, then return charset. */
+    if (charset->char_size == 4)
+        detect_eol_type_32bit(buf, size, charset, eol_typep);
+    else
+    if (charset->char_size == 2)    /* 2-byte charsets such as ucs2le/ucs2be */
+        detect_eol_type_16bit(buf, size, charset, eol_typep);
+    else
+        detect_eol_type_8bit(buf, size, charset, eol_typep);
+
+    return charset;     /* lets callers write `return detect_eol_type(...)` */
+}
+
 QECharset *detect_charset(const u8 *buf, int size, EOLType *eol_typep)
 {
 #if 0
@@ -1428,8 +1442,7 @@
         }
     }
     if (has_utf8) {
-        detect_eol_type_8bit(buf, size, &charset_utf8, eol_typep);
-        return &charset_utf8;
+        return detect_eol_type(buf, size, &charset_utf8, eol_typep);
     }
 
     /* Check for zwnbsp BOM: files starting with zero-width
@@ -1438,23 +1451,19 @@
      */
     if (size >= 2 && buf[0] == 0xff && buf[1] == 0xfe) {
         if (size >= 4 && buf[2] == 0 && buf[3] == 0) {
-            detect_eol_type_32bit(buf, size, &charset_ucs4le, eol_typep);
-            return &charset_ucs4le;
+            return detect_eol_type(buf, size, &charset_ucs4le, eol_typep);
         } else {
-            detect_eol_type_16bit(buf, size, &charset_ucs2le, eol_typep);
-            return &charset_ucs2le;
+            return detect_eol_type(buf, size, &charset_ucs2le, eol_typep);
         }
     }
 
     if (size >= 2 && buf[0] == 0xfe && buf[1] == 0xff) {
-        detect_eol_type_16bit(buf, size, &charset_ucs2be, eol_typep);
-        return &charset_ucs2be;
+        return detect_eol_type(buf, size, &charset_ucs2be, eol_typep);
     }
 
     if (size >= 4
     &&  buf[0] == 0 && buf[1] == 0 && buf[2] == 0xfe && buf[3] == 0xff) {
-        detect_eol_type_32bit(buf, size, &charset_ucs4be, eol_typep);
-        return &charset_ucs4be;
+        return detect_eol_type(buf, size, &charset_ucs4be, eol_typep);
     }
 
 #if 0
@@ -1468,15 +1477,28 @@
                 maxc[i & 3] = buf[i];
         }
         if (maxc[0] > 'a' && maxc[1] < 0x2f && maxc[2] > 'a' && maxc[3] < 0x2f) {
-            detect_eol_type_16bit(buf, size, &charset_ucs2le, eol_typep);
+            detect_eol_type(buf, size, &charset_ucs2le, eol_typep);
             return &charset_ucs2le;
         }
         if (maxc[1] > 'a' && maxc[0] < 0x2f && maxc[3] > 'a' && maxc[2] < 0x2f) {
-            detect_eol_type_16bit(buf, size, &charset_ucs2be, eol_typep);
+            detect_eol_type(buf, size, &charset_ucs2be, eol_typep);
             return &charset_ucs2be;
         }
     }
+#else
+    if (charset_ucs4le.probe_func(&charset_ucs4le, buf, size))
+        return detect_eol_type(buf, size, &charset_ucs4le, eol_typep);
+    else
+    if (charset_ucs4be.probe_func(&charset_ucs4be, buf, size))
+        return detect_eol_type(buf, size, &charset_ucs4be, eol_typep);
+    else
+    if (charset_ucs2le.probe_func(&charset_ucs2le, buf, size))
+        return detect_eol_type(buf, size, &charset_ucs2le, eol_typep);
+    else
+    if (charset_ucs2be.probe_func(&charset_ucs2be, buf, size))
+        return detect_eol_type(buf, size, &charset_ucs2be, eol_typep);
 #endif
+
     /* Should detect iso-2220-jp upon \033$@ and \033$B, but jis
      * support is not selected in tiny build
      * XXX: should use charset probe functions.
@@ -1499,7 +1521,7 @@
         return &charset_raw;
     }
 
-    detect_eol_type_8bit(buf, size, &charset_raw, eol_typep);
+    detect_eol_type(buf, size, &charset_raw, eol_typep);
 
     if (*eol_typep == EOL_DOS) {
         /* XXX: default DOS files to Latin1, should be selectable */

Index: tests/NonBMP1.utf8
===================================================================
RCS file: tests/NonBMP1.utf8
diff -N tests/NonBMP1.utf8
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ tests/NonBMP1.utf8  15 Mar 2014 14:55:05 -0000      1.1
@@ -0,0 +1,7 @@
+************ Non BMP1 characters ****************
+ððððð¿¿
+ðð ñòó¿¿¿
+ôõö÷÷¿¿¿
+øøø ø¿¿¿¿
+ùúüü
+üü ýý¿¿¿¿¿

Index: tests/TestPage.ucs2be.txt
===================================================================
RCS file: tests/TestPage.ucs2be.txt
diff -N tests/TestPage.ucs2be.txt
Binary files /dev/null and /tmp/cvs9PqdKt differ

Index: tests/TestPage.ucs2le.txt
===================================================================
RCS file: tests/TestPage.ucs2le.txt
diff -N tests/TestPage.ucs2le.txt
Binary files /dev/null and /tmp/cvsmd7zos differ

Index: tests/TestPage.ucs4be.txt
===================================================================
RCS file: tests/TestPage.ucs4be.txt
diff -N tests/TestPage.ucs4be.txt
Binary files /dev/null and /tmp/cvsjBTO6r differ

Index: tests/TestPage.ucs4le.txt
===================================================================
RCS file: tests/TestPage.ucs4le.txt
diff -N tests/TestPage.ucs4le.txt
Binary files /dev/null and /tmp/cvsq42NYs differ
[Prev in Thread]
Current Thread
[Next in Thread]
[Qemacs-commit] qemacs charset.c tests/NonBMP1.utf8 tests/TestP..., Charlie Gordon <=
Prev by Date: [Qemacs-commit] qemacs qe.c
Next by Date: [Qemacs-commit] qemacs/tests TestPage.txt
Previous by thread: [Qemacs-commit] qemacs makemode.c
Next by thread: [Qemacs-commit] qemacs/tests TestPage.txt
Index(es):
- Date
- Thread