help-smalltalk
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Help-smalltalk] [PATCH] fix utf7


From: Paolo Bonzini
Subject: [Help-smalltalk] [PATCH] fix utf7
Date: Sun, 01 Jul 2007 22:21:18 +0200
User-agent: Thunderbird 2.0.0.4 (Macintosh/20070604)

...

Paolo
2007-07-01  Paolo Bonzini  <address@hidden>

        * iconv/iconvtests.st: Add UTF-7 tests.
        * iconv/Sets.st: Rewrite FromUTF7.
 


--- orig/packages/iconv/Sets.st
+++ mod/packages/iconv/Sets.st
@@ -990,9 +995,8 @@ next
        "Convert to a surrogate pair"
        ch := ch - 16r10000.
        left := 32.
-       value := (((ch bitAnd: 16rFFF) + 16rD800) bitShift: 16) +
+       value := (((ch bitAnd: 16r3FF) + 16rD800) bitShift: 16) +
                 ((ch bitShift: -10) + 16rDC00).
-
        ^$+
     ].
 
@@ -1154,7 +1158,7 @@ getNext
        ch == 45 "minus" ifTrue: [ ^$+ ].
 
        "Else switch into base64 mode"
-       shift := 32.
+       shift := 16.
        wch := 0.
     ].
 
@@ -1164,7 +1168,7 @@ getNext
            "Terminate base64 encoding.
             If accumulated data is nonzero, the input is invalid.
             Also, partial UTF-16 characters are invalid."
-           (shift <= 26 or: [ wch > 0 ]) ifTrue: [
+           (shift <= 10 or: [ wch > 0 ]) ifTrue: [
                shift := 0.
                InvalidSequenceError signal ].
 
@@ -1175,43 +1179,34 @@ getNext
            ^self getNext
        ].
 
-    "Concatenate the base64 integer value to the accumulator"
-    shift > 6 ifTrue: [
-       shift := shift - 6.
-       wch := wch + (value bitShift: shift).
-       shift > 16 ifTrue: [ ^self getNext ].
-
-       (shift between: 11 and: 16) ifTrue: [
-           "Completed an UTF-16 character.  When we see a High
-            Surrogate, we must wait for the following Low Surrogate."
-           wc1 := wch bitShift: -16.
-
-           (wc1 between: 16rD800 and: 16rDBFF) ifFalse: [
-               wch := (wch bitAnd: 65535) bitShift: 16.
-               shift := shift + 16.
-               ^Character codePoint: wc1
-           ].
-       ].
-
-       (shift between: 5 and: 10) ifTrue: [
+    shift <= 6 ifTrue: [
+        wc1 := wch + (value bitShift: shift - 6).
+        wch := ((value bitShift: shift) bitAnd: 16r3F) bitShift: 10.
+        shift := shift + 10.
+
+       (wc1 between: 16rDC00 and: 16rDFFF) ifTrue: [
+           InvalidSequenceError signal ].
+       wc1 >= 16r4000000 ifTrue: [
            "After an High Surrogate, verify that the next character
             is indeed a Low Surrogate"
-           wc1 := wch bitAnd: 65535.
-           (wc1 between: 16rDC00 and: 16rDFFF) ifFalse: [
-               shift := 0.
-               InvalidSequenceError signal ]
-       ].
-    ].
+           (wc1 between: 16r400DC00 and: 16r7FFDFFF) ifTrue: [
+               wc1 := ((wc1 bitAnd: 16r3FF0000) bitShift: -6)
+                      + (wc1 bitAnd: 16r3FF) + 16r10000.
+               ^Character codePoint: wc1 ].
+           InvalidSequenceError signal ].
+       (wc1 between: 16rD800 and: 16rDBFF) ifFalse: [
+           ^Character codePoint: wc1 ].
+       shift = 0 ifTrue: [
+           InvalidSequenceError signal ].
+
+        "Read a High Surrogate."
+       wch := wch + (wc1 - 16rD400 bitShift: 16).
+       ^self getNext ].
 
-    "Completed an UTF-16 surrogate pair"
-
-    "35FDC00 = -0xD800 << 10 - 0xDC00 + 0x10000"
-    wc1 := wch bitShift: -16.
-    wch := (wch bitAnd: 65535) + (value bitShift: shift - 6).
-    wc1 := (wc1 bitShift: 10) + wch - 16r35FDC00.
-
-    wch := ((value bitShift: shift) bitAnd: 16r3F) bitShift: 26.
-    ^Character codePoint: wc1
+    "Concatenate the base64 integer value to the accumulator"
+    shift := shift - 6.
+    wch := wch + (value bitShift: shift).
+    ^self getNext!
 ! !
 
 


--- orig/packages/iconv/iconvtests.st
+++ mod/packages/iconv/iconvtests.st
@@ -113,4 +113,32 @@ testByteArrayAsUnicodeStringColon
     str := #[239 191 190].
     self assert: (str asUnicodeString: 'UTF-8') first = $<16rFFFE>.
     str := #[208 184].
-    self assert: (str asUnicodeString: 'UTF-8') first = $<16r438>! !
+    self assert: (str asUnicodeString: 'UTF-8') first = $<16r438>!
+
+testFromUTF7
+    self assert: ('+-' asUnicodeString: 'UTF-7') first = $+.
+    self assert: ('+BBg-' asUnicodeString: 'UTF-7') first = $<16r418>.
+    self assert: ('+BBgEOA-' asUnicodeString: 'UTF-7') second = $<16r438>.
+    self assert: ('+BBgEOAQZ-' asUnicodeString: 'UTF-7') third = $<16r419>.
+    self assert: ('+2//f/w-' asUnicodeString: 'UTF-7') first = $<16r10FFFF>.
+    self assert: ('+2//f/w-' asUnicodeString: 'UTF-7') size = 1.
+    self assert: ('+BDjb/9//-' asUnicodeString: 'UTF-7') last = $<16r10FFFF>.
+    self assert: ('+BDjb/9//-' asUnicodeString: 'UTF-7') size = 2.
+    self assert: ('+BDgEGNv/3/8-' asUnicodeString: 'UTF-7') last = $<16r10FFFF>.
+    self assert: ('+BDgEGNv/3/8-' asUnicodeString: 'UTF-7') size = 3!
+
+testToUTF7
+    | str |
+    self assert: ((UnicodeString with: $+) asString: 'UTF-7') asString = '+-'.
+    str := UnicodeString with: $<16r418>.
+    self assert: (str asString: 'UTF-7') encoding = 'UTF-7'.
+    self assert: (str asString: 'UTF-7') asString = '+BBg-'.
+    str := str copyWith: $<16r438>.
+    self assert: (str asString: 'UTF-7') asString = '+BBgEOA-'.
+    str := str copyWith: $<16r419>.
+    self assert: (str asString: 'UTF-7') asString = '+BBgEOAQZ-'.
+    str := UnicodeString with: $<16r10FFFF>.
+    self assert: (str asString: 'UTF-7') asString = '+2//f/w-'.
+!
+
+ !




reply via email to

[Prev in Thread] Current Thread [Next in Thread]