[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...
From: |
Sandro Santilli |
Subject: |
[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h... |
Date: |
Thu, 17 Apr 2008 08:05:37 +0000 |
CVSROOT: /sources/gnash
Module name: gnash
Changes by: Sandro Santilli <strk> 08/04/17 08:05:37
Modified files:
. : ChangeLog
libbase : utf8.cpp utf8.h
server : LoadVariablesThread.cpp
server/asobj : LoadVars.cpp
Log message:
* libbase/utf8.{cpp,h}: add a stripBOM interface and a TextEncoding
enum to deal with BOMs.
* server/LoadVariablesThread.cpp: handle BOMs, fixing failure in
actionscript.all/MovieClip.as
* server/asobj/LoadVars.cpp: delegate BOM handling to the new
specialized utf8:: interface.
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/gnash/ChangeLog?cvsroot=gnash&r1=1.6308&r2=1.6309
http://cvs.savannah.gnu.org/viewcvs/gnash/libbase/utf8.cpp?cvsroot=gnash&r1=1.10&r2=1.11
http://cvs.savannah.gnu.org/viewcvs/gnash/libbase/utf8.h?cvsroot=gnash&r1=1.16&r2=1.17
http://cvs.savannah.gnu.org/viewcvs/gnash/server/LoadVariablesThread.cpp?cvsroot=gnash&r1=1.10&r2=1.11
http://cvs.savannah.gnu.org/viewcvs/gnash/server/asobj/LoadVars.cpp?cvsroot=gnash&r1=1.46&r2=1.47
Patches:
Index: ChangeLog
===================================================================
RCS file: /sources/gnash/gnash/ChangeLog,v
retrieving revision 1.6308
retrieving revision 1.6309
diff -u -b -r1.6308 -r1.6309
--- ChangeLog 16 Apr 2008 23:07:14 -0000 1.6308
+++ ChangeLog 17 Apr 2008 08:05:36 -0000 1.6309
@@ -1,3 +1,12 @@
+2008-04-17 Sandro Santilli <address@hidden>
+
+ * libbase/utf8.{cpp,h}: add a stripBOM interface and a TextEncoding
+ enum to deal with BOMs.
+ * server/LoadVariablesThread.cpp: handle BOMs, fixing failure in
+ actionscript.all/MovieClip.as
+ * server/asobj/LoadVars.cpp: delegate BOM handling to the new
+ specialized utf8:: interface.
+
2008-04-16 Sandro Santilli <address@hidden>
* testsuite/actionscript.all/LoadVars.as: check that getBytesLoaded()
Index: libbase/utf8.cpp
===================================================================
RCS file: /sources/gnash/gnash/libbase/utf8.cpp,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -b -r1.10 -r1.11
--- libbase/utf8.cpp 28 Mar 2008 13:52:30 -0000 1.10
+++ libbase/utf8.cpp 17 Apr 2008 08:05:36 -0000 1.11
@@ -244,6 +244,84 @@
return text;
}
+
+// NOTE(review): none of the code visible in this patch references these
+// ENC_* macros, and they cover fewer encodings than utf8::TextEncoding --
+// presumably left over from an earlier draft; confirm before removing.
+#define ENC_DEFAULT 0
+#define ENC_UTF8 1
+#define ENC_UTF16BE 2
+#define ENC_UTF16LE 3
+
+/// Detect and skip a Byte Order Mark at the start of a buffer.
+//
+/// No bytes are copied or converted: the function only works out which
+/// BOM (if any) the buffer starts with and steps over it.
+///
+/// @param in
+///	Start of the input buffer. Not dereferenced when 'size' is
+///	smaller than the shortest BOM (2 bytes).
+///
+/// @param size
+///	Number of bytes available at 'in'. Decremented by the length
+///	of the BOM that was skipped, if any.
+///
+/// @param encoding
+///	Output parameter, always set: the encoding announced by the
+///	BOM, or encUNSPECIFIED when no known BOM is found.
+///
+/// @returns
+///	'in' advanced past the BOM, or 'in' itself when there is none.
+///	For a buffer holding nothing but a BOM this is 'in + size'
+///	(as passed in) and 'size' becomes 0.
+///
+char*
+utf8::stripBOM(char* in, size_t& size, TextEncoding& encoding)
+{
+	encoding = encUNSPECIFIED;
+
+	// Compare as unsigned bytes so the 0xNN literals need no casts.
+	const unsigned char* ptr = reinterpret_cast<const unsigned char*>(in);
+
+	// Number of BOM bytes to skip; stays 0 when nothing is recognised.
+	size_t bomLength = 0;
+
+	// The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 ones:
+	// the UTF-32 LE mark (FF FE 00 00) starts with the UTF-16 LE mark
+	// (FF FE), so testing UTF-16 LE first would make UTF-32 LE unreachable.
+	//
+	// Every size guard is '>=' the mark length, so a buffer that consists
+	// solely of a BOM is still recognised and stripped to an empty one.
+	if ( size >= 4 && ptr[0] == 0x00 && ptr[1] == 0x00 &&
+			ptr[2] == 0xFE && ptr[3] == 0xFF )
+	{
+		// Text is UTF-32 BE
+		encoding = encUTF32BE;
+		bomLength = 4;
+	}
+	else if ( size >= 4 && ptr[0] == 0xFF && ptr[1] == 0xFE &&
+			ptr[2] == 0x00 && ptr[3] == 0x00 )
+	{
+		// Text is UTF-32 LE
+		encoding = encUTF32LE;
+		bomLength = 4;
+	}
+	else if ( size >= 3 && ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF )
+	{
+		// Text is UTF-8
+		encoding = encUTF8;
+		bomLength = 3;
+	}
+	else if ( size >= 2 && ptr[0] == 0xFF && ptr[1] == 0xFE )
+	{
+		// Text is UTF-16 LE
+		encoding = encUTF16LE;
+		bomLength = 2;
+	}
+	else if ( size >= 2 && ptr[0] == 0xFE && ptr[1] == 0xFF )
+	{
+		// Text is UTF-16 BE
+		encoding = encUTF16BE;
+		bomLength = 2;
+	}
+
+	// TODO: check other kinds of boms !
+	// See http://en.wikipedia.org/wiki/Byte-order_mark#Representations_of_byte_order_marks_by_encoding
+
+	in += bomLength;
+	size -= bomLength;
+
+	return in;
+}
+
+/// Map a TextEncoding value to its printable name.
+//
+/// @param enc
+///	The encoding to describe.
+///
+/// @returns
+///	A string literal naming 'enc', or "INVALID" when 'enc' is not
+///	one of the TextEncoding enumerators.
+///
+const char*
+utf8::textEncodingName(TextEncoding enc)
+{
+	// Start from the fallback; a recognised enumerator overrides it.
+	const char* label = "INVALID";
+
+	switch (enc)
+	{
+		case encUNSPECIFIED: label = "Unspecified"; break;
+		case encUTF8:        label = "UTF8";        break;
+		case encUTF16BE:     label = "UTF16BE";     break;
+		case encUTF16LE:     label = "UTF16LE";     break;
+		case encUTF32BE:     label = "UTF32BE";     break;
+		case encUTF32LE:     label = "UTF32LE";     break;
+		case encSCSU:        label = "SCSU";        break;
+		case encUTF7:        label = "UTF7";        break;
+		case encUTFEBCDIC:   label = "UTFEBCDIC";   break;
+		case encBOCU1:       label = "BOCU1";       break;
+		default:             break;
+	}
+
+	return label;
+}
+
+
// Local Variables:
// mode: C++
// c-basic-offset: 8
Index: libbase/utf8.h
===================================================================
RCS file: /sources/gnash/gnash/libbase/utf8.h,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -b -r1.16 -r1.17
--- libbase/utf8.h 28 Mar 2008 15:13:34 -0000 1.16
+++ libbase/utf8.h 17 Apr 2008 08:05:37 -0000 1.17
@@ -95,6 +95,48 @@
/// Allows storage of Latin1 (ISO-8859-1) characters. This
/// is the format of SWF5 and below.
std::string encodeLatin1Character(boost::uint32_t ucsCharacter);
+
+ /// Text encodings that a Byte Order Mark can announce.
+ //
+ /// Values are reported by stripBOM() and named by textEncodingName().
+ enum TextEncoding {
+ encUNSPECIFIED, ///< No BOM found (stripBOM's result when nothing matches)
+ encUTF8, ///< BOM bytes EF BB BF
+ encUTF16BE, ///< BOM bytes FE FF
+ encUTF16LE, ///< BOM bytes FF FE
+ encUTF32BE, ///< BOM bytes 00 00 FE FF
+ encUTF32LE, ///< BOM bytes FF FE 00 00
+ encSCSU, ///< Named by textEncodingName(); not detected by stripBOM() in this patch
+ encUTF7, ///< Named by textEncodingName(); not detected by stripBOM() in this patch
+ encUTFEBCDIC, ///< Named by textEncodingName(); not detected by stripBOM() in this patch
+ encBOCU1 ///< Named by textEncodingName(); not detected by stripBOM() in this patch
+ };
+
+ /// Interpret (and skip) a Byte Order Mark at the start of a buffer
+ //
+ /// This function takes a pointer to a buffer and returns
+ /// the start of the actual data, past the BOM if one is present.
+ /// No conversion or copying is performed: the BOM bytes are
+ /// merely skipped, and the encoding they denote is reported
+ /// through the 'encoding' output parameter.
+ ///
+ /// See http://en.wikipedia.org/wiki/Byte-order_mark
+ ///
+ /// @param in
+ /// The input buffer.
+ ///
+ /// @param size
+ /// Size of the input buffer in bytes; decremented by the
+ /// size of the BOM, if any.
+ ///
+ /// @param encoding
+ /// Output parameter, will always be set.
+ /// encUNSPECIFIED if no BOM is found.
+ ///
+ /// @returns
+ /// A pointer either equal to 'in' or a few bytes past it.
+ ///
+ char* stripBOM(char* in, size_t& size, TextEncoding& encoding);
+
+ /// Return the printable name of a text encoding
+ //
+ /// @param enc
+ /// The encoding to name.
+ ///
+ /// @returns
+ /// A string naming 'enc', or "INVALID" if 'enc' is not a
+ /// TextEncoding enumerator.
+ ///
+ const char* textEncodingName(TextEncoding enc);
}
Index: server/LoadVariablesThread.cpp
===================================================================
RCS file: /sources/gnash/gnash/server/LoadVariablesThread.cpp,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -b -r1.10 -r1.11
--- server/LoadVariablesThread.cpp 15 Apr 2008 11:27:19 -0000 1.10
+++ server/LoadVariablesThread.cpp 17 Apr 2008 08:05:37 -0000 1.11
@@ -24,6 +24,7 @@
#include "tu_file.h"
#include "log.h"
#include "GnashException.h"
+#include "utf8.h"
#include <string>
@@ -50,17 +51,32 @@
string toparse;
size_t CHUNK_SIZE = 1024;
- char *buf = new char[CHUNK_SIZE];
+ char* buf = new char[CHUNK_SIZE];
unsigned int parsedLines = 0;
+ // TODO: use read_string ?
while ( size_t read = _stream->read_bytes(buf, CHUNK_SIZE) )
{
#ifdef DEBUG_LOAD_VARIABLES
log_debug("Read %u bytes", read);
#endif
- // TODO: use read_string ?
+ if ( _bytesLoaded )
+ {
string chunk(buf, read);
toparse += chunk;
+ }
+ else
+ {
+ size_t dataSize = read;
+ utf8::TextEncoding encoding;
+ char* ptr = utf8::stripBOM(buf, dataSize, encoding);
+ if ( encoding != utf8::encUTF8 && encoding !=
utf8::encUNSPECIFIED )
+ {
+ log_unimpl("%s to utf8 conversion in
MovieClip.loadVariables input parsing", utf8::textEncodingName(encoding));
+ }
+ string chunk(ptr, dataSize);
+ toparse += chunk;
+ }
#ifdef DEBUG_LOAD_VARIABLES
log_debug("toparse: %s", toparse.c_str());
Index: server/asobj/LoadVars.cpp
===================================================================
RCS file: /sources/gnash/gnash/server/asobj/LoadVars.cpp,v
retrieving revision 1.46
retrieving revision 1.47
diff -u -b -r1.46 -r1.47
--- server/asobj/LoadVars.cpp 16 Apr 2008 23:07:15 -0000 1.46
+++ server/asobj/LoadVars.cpp 17 Apr 2008 08:05:37 -0000 1.47
@@ -38,6 +38,7 @@
#include "Object.h" // for getObjectInterface
#include "LoadThread.h"
#include "namedStrings.h"
+#include "utf8.h"
#include <list>
#include <boost/algorithm/string/case_conv.hpp>
@@ -259,35 +260,15 @@
#endif
}
buf[actuallyRead] = '\0';
+
// Strip BOM, if any.
// See http://savannah.gnu.org/bugs/?19915
- char* bufptr = buf.get();
- log_debug("xmlsize:%d, ptr:%s", xmlsize, bufptr);
- if ( xmlsize > 2 )
- {
- // need *ptr to be unsigned or cast all 0xNN
- unsigned char* ptr = reinterpret_cast<unsigned char*>(bufptr);
-
- if ( *ptr == 0xFF && *(ptr+1) == 0xFE )
- {
- // Text is UTF-16 LE,
- // we should convert to UTF-8
- log_unimpl("Conversion from UTF-16 LE to UTF-8
(LoadVars)");
- bufptr+=2;
- }
- else if ( *ptr == 0xFE && *(ptr+1) == 0xFF )
- {
- // Text is UTF-16 BE,
- // we should convert to UTF-8
- log_unimpl("Conversion from UTF-16 BE to UTF-8
(LoadVars)");
- bufptr+=2;
- }
- else if ( xmlsize > 3 && *ptr == 0xEF && *(ptr+1) == 0xBB &&
*(ptr+2) == 0xBF )
+ utf8::TextEncoding encoding;
+ // NOTE: the call below will possibly change 'xmlsize' parameter
+ char* bufptr = utf8::stripBOM(buf.get(), xmlsize, encoding);
+ if ( encoding != utf8::encUTF8 && encoding != utf8::encUNSPECIFIED
)
{
- log_debug("UTF8 bom (LoadVars)");
- // Text is UTF-8
- bufptr+=3;
- }
+ log_unimpl("%s to utf8 conversion in LoadVars input parsing",
utf8::textEncodingName(encoding));
}
as_value dataVal(bufptr); // memory copy here (optimize?)
- [Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...,
Sandro Santilli <=