[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...

gnash-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...

From:	Sandro Santilli
Subject:	[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...
Date:	Thu, 17 Apr 2008 08:05:37 +0000
CVSROOT:        /sources/gnash
Module name:    gnash
Changes by:     Sandro Santilli <strk>  08/04/17 08:05:37

Modified files:
        .              : ChangeLog 
        libbase        : utf8.cpp utf8.h 
        server         : LoadVariablesThread.cpp 
        server/asobj   : LoadVars.cpp 

Log message:
        * libbase/utf8.{cpp,h}: add a stripBOM interface and a TextEncoding
          enum to deal with BOMs.
        * server/LoadVariablesThread.cpp: handle BOMs, fixing failure in
          actionscript.all/MovieClip.as
        * server/asobj/LoadVars.cpp: delegate BOM handling to the new
          specialized utf8:: interface.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/gnash/ChangeLog?cvsroot=gnash&r1=1.6308&r2=1.6309
http://cvs.savannah.gnu.org/viewcvs/gnash/libbase/utf8.cpp?cvsroot=gnash&r1=1.10&r2=1.11
http://cvs.savannah.gnu.org/viewcvs/gnash/libbase/utf8.h?cvsroot=gnash&r1=1.16&r2=1.17
http://cvs.savannah.gnu.org/viewcvs/gnash/server/LoadVariablesThread.cpp?cvsroot=gnash&r1=1.10&r2=1.11
http://cvs.savannah.gnu.org/viewcvs/gnash/server/asobj/LoadVars.cpp?cvsroot=gnash&r1=1.46&r2=1.47

Patches:
Index: ChangeLog
===================================================================
RCS file: /sources/gnash/gnash/ChangeLog,v
retrieving revision 1.6308
retrieving revision 1.6309
diff -u -b -r1.6308 -r1.6309
--- ChangeLog   16 Apr 2008 23:07:14 -0000      1.6308
+++ ChangeLog   17 Apr 2008 08:05:36 -0000      1.6309
@@ -1,3 +1,12 @@
+2008-04-17 Sandro Santilli <address@hidden>
+
+       * libbase/utf8.{cpp,h}: add a stripBOM interface and a TextEncoding
+         enum to deal with BOMs.
+       * server/LoadVariablesThread.cpp: handle BOMs, fixing failure in
+         actionscript.all/MovieClip.as
+       * server/asobj/LoadVars.cpp: delegate BOM handling to the new
+         specialized utf8:: interface.
+
 2008-04-16 Sandro Santilli <address@hidden>
 
        * testsuite/actionscript.all/LoadVars.as: check that getBytesLoaded()

Index: libbase/utf8.cpp
===================================================================
RCS file: /sources/gnash/gnash/libbase/utf8.cpp,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -b -r1.10 -r1.11
--- libbase/utf8.cpp    28 Mar 2008 13:52:30 -0000      1.10
+++ libbase/utf8.cpp    17 Apr 2008 08:05:36 -0000      1.11
@@ -244,6 +244,84 @@
        return text;
 }
 
+
+#define ENC_DEFAULT 0
+#define ENC_UTF8 1
+#define ENC_UTF16BE 2
+#define ENC_UTF16LE 3
+
+char*
+utf8::stripBOM(char* in, size_t& size, TextEncoding& encoding)
+{
+       encoding = encUNSPECIFIED;
+       if ( size > 2 )
+       {
+               // need *ptr to be unsigned or cast all 0xNN
+               unsigned char* ptr = reinterpret_cast<unsigned char*>(in);
+
+               if ( *ptr == 0xFF && *(ptr+1) == 0xFE )
+               {
+                       // Text is UTF-16 LE
+                       encoding = encUTF16LE;
+                       in+=2;
+                       size-=2;
+               }
+               else if ( *ptr == 0xFE && *(ptr+1) == 0xFF )
+               {
+                       // Text is UTF-16 BE
+                       encoding = encUTF16BE;
+                       in+=2;
+                       size-=2;
+               }
+               else if ( size > 3 && *ptr == 0xEF && *(ptr+1) == 0xBB && *(ptr+2) == 0xBF )
+               {
+                       // Text is UTF-8
+                       encoding = encUTF8;
+                       in+=3;
+                       size-=3;
+               }
+               else if ( size > 4 && *ptr == 0x00 && *(ptr+1) == 0x00 && *(ptr+2) == 0xFE && *(ptr+3) == 0xFF )
+               {
+                       // Text is UTF-32 BE
+                       encoding = encUTF32BE;
+                       in+=4;
+                       size-=4;
+               }
+               else if ( size > 4 && *ptr == 0xFF && *(ptr+1) == 0xFE && *(ptr+2) == 0x00 && *(ptr+3) == 0x00 )
+               {
+                       // Text is UTF-32 LE
+                       encoding = encUTF32LE;
+                       in+=4;
+                       size-=4;
+               }
+
+               // TODO: check other kinds of boms !
+               // See http://en.wikipedia.org/wiki/Byte-order_mark#Representations_of_byte_order_marks_by_encoding
+       }
+
+       return in;
+}
+
+const char*
+utf8::textEncodingName(TextEncoding enc)
+{
+       switch (enc)
+       {
+               case encUNSPECIFIED: return "Unspecified";
+               case encUTF8: return "UTF8";
+               case encUTF16BE: return "UTF16BE";
+               case encUTF16LE: return "UTF16LE";
+               case encUTF32BE: return "UTF32BE";
+               case encUTF32LE: return "UTF32LE";
+               case encSCSU: return "SCSU";
+               case encUTF7: return "UTF7";
+               case encUTFEBCDIC: return "UTFEBCDIC";
+               case encBOCU1: return "BOCU1";
+               default: return "INVALID";
+       }
+}
+
+
 // Local Variables:
 // mode: C++
 // c-basic-offset: 8 

Index: libbase/utf8.h
===================================================================
RCS file: /sources/gnash/gnash/libbase/utf8.h,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -b -r1.16 -r1.17
--- libbase/utf8.h      28 Mar 2008 15:13:34 -0000      1.16
+++ libbase/utf8.h      17 Apr 2008 08:05:37 -0000      1.17
@@ -95,6 +95,48 @@
        /// Allows storage of Latin1 (ISO-8859-1) characters. This
        /// is the format of SWF5 and below.
        std::string encodeLatin1Character(boost::uint32_t ucsCharacter);
+
+       enum TextEncoding {
+               encUNSPECIFIED,
+               encUTF8,
+               encUTF16BE,
+               encUTF16LE,
+               encUTF32BE,
+               encUTF32LE,
+               encSCSU,
+               encUTF7,
+               encUTFEBCDIC,
+               encBOCU1
+       };
+
+       /// Interpret (and skip) Byte Order Mark in input stream
+       //
+       /// This function takes a pointer to a buffer and returns
+       /// the start of actual data after an eventual BOM.
+       /// No conversion is performed, no bytes copy, just skipping of
+       /// the BOM snippet and interpretation of it returned to the
+       /// encoding input parameter.
+       ///
+       /// See http://en.wikipedia.org/wiki/Byte-order_mark
+       ///
+       /// @param in
+       ///     The input buffer.
+       ///
+       /// @param size
+       ///     Size of the input buffer, will be decremented by the
+       ///     size of the BOM, if any.
+       ///
+       /// @param encoding
+       ///     Output parameter, will always be set.
+       ///     encUNSPECIFIED if no BOM is found.
+       ///
+       /// @returns
+       ///     A pointer either equal to 'in' or some bytes inside it.
+       ///
+       char* stripBOM(char* in, size_t& size, TextEncoding& encoding);
+
+       /// Return name of a text encoding
+       const char* textEncodingName(TextEncoding enc);
 }
 
 

Index: server/LoadVariablesThread.cpp
===================================================================
RCS file: /sources/gnash/gnash/server/LoadVariablesThread.cpp,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -b -r1.10 -r1.11
--- server/LoadVariablesThread.cpp      15 Apr 2008 11:27:19 -0000      1.10
+++ server/LoadVariablesThread.cpp      17 Apr 2008 08:05:37 -0000      1.11
@@ -24,6 +24,7 @@
 #include "tu_file.h"
 #include "log.h"
 #include "GnashException.h"
+#include "utf8.h"
 
 #include <string>
 
@@ -50,17 +51,32 @@
        string toparse;
 
        size_t CHUNK_SIZE = 1024;
-       char *buf = new char[CHUNK_SIZE];
+       char* buf = new char[CHUNK_SIZE];
        unsigned int parsedLines = 0;
+       // TODO: use read_string ?
        while ( size_t read = _stream->read_bytes(buf, CHUNK_SIZE) )
        {
 #ifdef DEBUG_LOAD_VARIABLES
                log_debug("Read %u bytes", read);
 #endif
 
-               // TODO: use read_string ?
+               if ( _bytesLoaded )
+               {
                string chunk(buf, read);
                toparse += chunk;
+               }
+               else
+               {
+                       size_t dataSize = read;
+                       utf8::TextEncoding encoding;
+                       char* ptr = utf8::stripBOM(buf, dataSize, encoding);
+                       if ( encoding != utf8::encUTF8 && encoding != utf8::encUNSPECIFIED )
+                       {
+                               log_unimpl("%s to utf8 conversion in MovieClip.loadVariables input parsing", utf8::textEncodingName(encoding));
+                       }
+                       string chunk(ptr, dataSize);
+                       toparse += chunk;
+               }
 
 #ifdef DEBUG_LOAD_VARIABLES
                log_debug("toparse: %s", toparse.c_str());

Index: server/asobj/LoadVars.cpp
===================================================================
RCS file: /sources/gnash/gnash/server/asobj/LoadVars.cpp,v
retrieving revision 1.46
retrieving revision 1.47
diff -u -b -r1.46 -r1.47
--- server/asobj/LoadVars.cpp   16 Apr 2008 23:07:15 -0000      1.46
+++ server/asobj/LoadVars.cpp   17 Apr 2008 08:05:37 -0000      1.47
@@ -38,6 +38,7 @@
 #include "Object.h" // for getObjectInterface
 #include "LoadThread.h"
 #include "namedStrings.h"
+#include "utf8.h"
 
 #include <list>
 #include <boost/algorithm/string/case_conv.hpp>
@@ -259,35 +260,15 @@
 #endif
                        }
             buf[actuallyRead] = '\0';
+
             // Strip BOM, if any.
             // See http://savannah.gnu.org/bugs/?19915
-            char* bufptr = buf.get();
-            log_debug("xmlsize:%d, ptr:%s", xmlsize, bufptr);
-            if ( xmlsize > 2 )
-            {
-                 // need *ptr to be unsigned or cast all 0xNN
-                 unsigned char* ptr = reinterpret_cast<unsigned char*>(bufptr);
-
-                 if ( *ptr == 0xFF && *(ptr+1) == 0xFE )
-                 {
-                     // Text is UTF-16 LE,
-                     // we should convert to UTF-8
-                     log_unimpl("Conversion from UTF-16 LE to UTF-8 (LoadVars)");
-                     bufptr+=2;
-                 }
-                 else if ( *ptr == 0xFE && *(ptr+1) == 0xFF )
-                 {
-                     // Text is UTF-16 BE, 
-                     // we should convert to UTF-8
-                     log_unimpl("Conversion from UTF-16 BE to UTF-8 (LoadVars)");
-                     bufptr+=2;
-                 }
-                 else if ( xmlsize > 3 && *ptr == 0xEF && *(ptr+1) == 0xBB && *(ptr+2) == 0xBF )
+            utf8::TextEncoding encoding;
+            // NOTE: the call below will possibly change 'xmlsize' parameter
+            char* bufptr = utf8::stripBOM(buf.get(), xmlsize, encoding);
+            if ( encoding != utf8::encUTF8 && encoding != utf8::encUNSPECIFIED )
                  {
-                     log_debug("UTF8 bom (LoadVars)");
-                     // Text is UTF-8
-                     bufptr+=3;
-                 }
+                log_unimpl("%s to utf8 conversion in LoadVars input parsing", utf8::textEncodingName(encoding));
             }
             as_value dataVal(bufptr); // memory copy here (optimize?)
[Prev in Thread]
Current Thread
[Next in Thread]
[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h..., Sandro Santilli <=
- [Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h..., Benjamin Wolsey, 2008/04/28
  - Re: [Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h..., zou lunkai, 2008/04/28
Prev by Date: [Gnash-commit] gnash ChangeLog server/asobj/LoadVars.cpp tests...
Next by Date: [Gnash-commit] gnash ChangeLog server/asobj/xml.cpp testsuite/...
Previous by thread: [Gnash-commit] gnash ChangeLog server/asobj/LoadVars.cpp tests...
Next by thread: [Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...
Index(es):
- Date
- Thread