[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemacs-commit] qemacs charset.c tests/NonBMP1.utf8 tests/TestP...
From: |
Charlie Gordon |
Subject: |
[Qemacs-commit] qemacs charset.c tests/NonBMP1.utf8 tests/TestP... |
Date: |
Sat, 15 Mar 2014 14:55:06 +0000 |
CVSROOT: /sources/qemacs
Module name: qemacs
Changes by: Charlie Gordon <chqrlie> 14/03/15 14:55:06
Modified files:
. : charset.c
Added files:
tests : NonBMP1.utf8 TestPage.ucs2be.txt
TestPage.ucs2le.txt TestPage.ucs4be.txt
TestPage.ucs4le.txt
Log message:
fix charset_detect
* use charset probing functions to detect ucs2 and ucs4 charsets in files
without a BOM (byte order mark)
* add test files for multibyte charsets in tests directory
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/qemacs/charset.c?cvsroot=qemacs&r1=1.33&r2=1.34
http://cvs.savannah.gnu.org/viewcvs/qemacs/tests/NonBMP1.utf8?cvsroot=qemacs&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/qemacs/tests/TestPage.ucs2be.txt?cvsroot=qemacs&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/qemacs/tests/TestPage.ucs2le.txt?cvsroot=qemacs&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/qemacs/tests/TestPage.ucs4be.txt?cvsroot=qemacs&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/qemacs/tests/TestPage.ucs4le.txt?cvsroot=qemacs&rev=1.1
Patches:
Index: charset.c
===================================================================
RCS file: /sources/qemacs/qemacs/charset.c,v
retrieving revision 1.33
retrieving revision 1.34
diff -u -b -r1.33 -r1.34
--- charset.c 1 Mar 2014 22:37:26 -0000 1.33
+++ charset.c 15 Mar 2014 14:55:05 -0000 1.34
@@ -1208,7 +1208,7 @@
}
/* detect the end of line type. */
-void detect_eol_type_8bit(const u8 *buf, int size,
+static void detect_eol_type_8bit(const u8 *buf, int size,
QECharset *charset, EOLType *eol_typep)
{
const u8 *p, *p1;
@@ -1259,7 +1259,7 @@
*eol_typep = eol_type;
}
-void detect_eol_type_16bit(const u8 *buf, int size,
+static void detect_eol_type_16bit(const u8 *buf, int size,
QECharset *charset, EOLType *eol_typep)
{
const uint16_t *p, *p1;
@@ -1317,7 +1317,7 @@
*eol_typep = eol_type;
}
-void detect_eol_type_32bit(const u8 *buf, int size,
+static void detect_eol_type_32bit(const u8 *buf, int size,
QECharset *charset, EOLType *eol_typep)
{
const uint32_t *p, *p1;
@@ -1375,6 +1375,20 @@
*eol_typep = eol_type;
}
+/* Run the EOL detector matching charset->char_size (4, 2, else 8-bit); returns charset. */
+static QECharset *detect_eol_type(const u8 *buf, int size,
+                                  QECharset *charset, EOLType *eol_typep)
+{
+    if (charset->char_size == 4)
+        detect_eol_type_32bit(buf, size, charset, eol_typep);
+    else if (charset->char_size == 2)
+        detect_eol_type_16bit(buf, size, charset, eol_typep);
+    else
+        detect_eol_type_8bit(buf, size, charset, eol_typep);
+
+    return charset;
+}
+
QECharset *detect_charset(const u8 *buf, int size, EOLType *eol_typep)
{
#if 0
@@ -1428,8 +1442,7 @@
}
}
if (has_utf8) {
- detect_eol_type_8bit(buf, size, &charset_utf8, eol_typep);
- return &charset_utf8;
+ return detect_eol_type(buf, size, &charset_utf8, eol_typep);
}
/* Check for zwnbsp BOM: files starting with zero-width
@@ -1438,23 +1451,19 @@
*/
if (size >= 2 && buf[0] == 0xff && buf[1] == 0xfe) {
if (size >= 4 && buf[2] == 0 && buf[3] == 0) {
- detect_eol_type_32bit(buf, size, &charset_ucs4le, eol_typep);
- return &charset_ucs4le;
+ return detect_eol_type(buf, size, &charset_ucs4le, eol_typep);
} else {
- detect_eol_type_16bit(buf, size, &charset_ucs2le, eol_typep);
- return &charset_ucs2le;
+ return detect_eol_type(buf, size, &charset_ucs2le, eol_typep);
}
}
if (size >= 2 && buf[0] == 0xfe && buf[1] == 0xff) {
- detect_eol_type_16bit(buf, size, &charset_ucs2be, eol_typep);
- return &charset_ucs2be;
+ return detect_eol_type(buf, size, &charset_ucs2be, eol_typep);
}
if (size >= 4
&& buf[0] == 0 && buf[1] == 0 && buf[2] == 0xfe && buf[3] == 0xff) {
- detect_eol_type_32bit(buf, size, &charset_ucs4be, eol_typep);
- return &charset_ucs4be;
+ return detect_eol_type(buf, size, &charset_ucs4be, eol_typep);
}
#if 0
@@ -1468,15 +1477,28 @@
maxc[i & 3] = buf[i];
}
if (maxc[0] > 'a' && maxc[1] < 0x2f && maxc[2] > 'a' && maxc[3] <
0x2f) {
- detect_eol_type_16bit(buf, size, &charset_ucs2le, eol_typep);
+ detect_eol_type(buf, size, &charset_ucs2le, eol_typep);
return &charset_ucs2le;
}
if (maxc[1] > 'a' && maxc[0] < 0x2f && maxc[3] > 'a' && maxc[2] <
0x2f) {
- detect_eol_type_16bit(buf, size, &charset_ucs2be, eol_typep);
+ detect_eol_type(buf, size, &charset_ucs2be, eol_typep);
return &charset_ucs2be;
}
}
+#else
+ if (charset_ucs4le.probe_func(&charset_ucs4le, buf, size))
+ return detect_eol_type(buf, size, &charset_ucs4le, eol_typep);
+ else
+ if (charset_ucs4be.probe_func(&charset_ucs4be, buf, size))
+ return detect_eol_type(buf, size, &charset_ucs4be, eol_typep);
+ else
+ if (charset_ucs2le.probe_func(&charset_ucs2le, buf, size))
+ return detect_eol_type(buf, size, &charset_ucs2le, eol_typep);
+ else
+ if (charset_ucs2be.probe_func(&charset_ucs2be, buf, size))
+ return detect_eol_type(buf, size, &charset_ucs2be, eol_typep);
#endif
+
/* Should detect iso-2022-jp upon \033$@ and \033$B, but jis
* support is not selected in tiny build
* XXX: should use charset probe functions.
@@ -1499,7 +1521,7 @@
return &charset_raw;
}
- detect_eol_type_8bit(buf, size, &charset_raw, eol_typep);
+ detect_eol_type(buf, size, &charset_raw, eol_typep);
if (*eol_typep == EOL_DOS) {
/* XXX: default DOS files to Latin1, should be selectable */
Index: tests/NonBMP1.utf8
===================================================================
RCS file: tests/NonBMP1.utf8
diff -N tests/NonBMP1.utf8
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ tests/NonBMP1.utf8 15 Mar 2014 14:55:05 -0000 1.1
@@ -0,0 +1,7 @@
+************ Non BMP1 characters ****************
+ððððð¿¿
+ðð ñòó¿¿¿
+ôõö÷÷¿¿¿
+øøø ø¿¿¿¿
+ùúüü
+üü ýý¿¿¿¿¿
Index: tests/TestPage.ucs2be.txt
===================================================================
RCS file: tests/TestPage.ucs2be.txt
diff -N tests/TestPage.ucs2be.txt
Binary files /dev/null and /tmp/cvs9PqdKt differ
Index: tests/TestPage.ucs2le.txt
===================================================================
RCS file: tests/TestPage.ucs2le.txt
diff -N tests/TestPage.ucs2le.txt
Binary files /dev/null and /tmp/cvsmd7zos differ
Index: tests/TestPage.ucs4be.txt
===================================================================
RCS file: tests/TestPage.ucs4be.txt
diff -N tests/TestPage.ucs4be.txt
Binary files /dev/null and /tmp/cvsjBTO6r differ
Index: tests/TestPage.ucs4le.txt
===================================================================
RCS file: tests/TestPage.ucs4le.txt
diff -N tests/TestPage.ucs4le.txt
Binary files /dev/null and /tmp/cvsq42NYs differ
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Qemacs-commit] qemacs charset.c tests/NonBMP1.utf8 tests/TestP...,
Charlie Gordon <=