[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Help-smalltalk] [PATCH] fix utf7
From: |
Paolo Bonzini |
Subject: |
[Help-smalltalk] [PATCH] fix utf7 |
Date: |
Sun, 01 Jul 2007 22:21:18 +0200 |
User-agent: |
Thunderbird 2.0.0.4 (Macintosh/20070604) |
...
Paolo
2007-07-01 Paolo Bonzini <address@hidden>
* iconv/iconvtests.st: Add UTF-7 tests.
* iconv/Sets.st: Rewrite FromUTF7.
--- orig/packages/iconv/Sets.st
+++ mod/packages/iconv/Sets.st
@@ -990,9 +995,8 @@ next
"Convert to a surrogate pair"
ch := ch - 16r10000.
left := 32.
- value := (((ch bitAnd: 16rFFF) + 16rD800) bitShift: 16) +
+ value := (((ch bitAnd: 16r3FF) + 16rD800) bitShift: 16) +
((ch bitShift: -10) + 16rDC00).
-
^$+
].
@@ -1154,7 +1158,7 @@ getNext
ch == 45 "minus" ifTrue: [ ^$+ ].
"Else switch into base64 mode"
- shift := 32.
+ shift := 16.
wch := 0.
].
@@ -1164,7 +1168,7 @@ getNext
"Terminate base64 encoding.
If accumulated data is nonzero, the input is invalid.
Also, partial UTF-16 characters are invalid."
- (shift <= 26 or: [ wch > 0 ]) ifTrue: [
+ (shift <= 10 or: [ wch > 0 ]) ifTrue: [
shift := 0.
InvalidSequenceError signal ].
@@ -1175,43 +1179,34 @@ getNext
^self getNext
].
- "Concatenate the base64 integer value to the accumulator"
- shift > 6 ifTrue: [
- shift := shift - 6.
- wch := wch + (value bitShift: shift).
- shift > 16 ifTrue: [ ^self getNext ].
-
- (shift between: 11 and: 16) ifTrue: [
- "Completed an UTF-16 character. When we see a High
- Surrogate, we must wait for the following Low Surrogate."
- wc1 := wch bitShift: -16.
-
- (wc1 between: 16rD800 and: 16rDBFF) ifFalse: [
- wch := (wch bitAnd: 65535) bitShift: 16.
- shift := shift + 16.
- ^Character codePoint: wc1
- ].
- ].
-
- (shift between: 5 and: 10) ifTrue: [
+ shift <= 6 ifTrue: [
+ wc1 := wch + (value bitShift: shift - 6).
+ wch := ((value bitShift: shift) bitAnd: 16r3F) bitShift: 10.
+ shift := shift + 10.
+
+ (wc1 between: 16rDC00 and: 16rDFFF) ifTrue: [
+ InvalidSequenceError signal ].
+ wc1 >= 16r4000000 ifTrue: [
"After an High Surrogate, verify that the next character
is indeed a Low Surrogate"
- wc1 := wch bitAnd: 65535.
- (wc1 between: 16rDC00 and: 16rDFFF) ifFalse: [
- shift := 0.
- InvalidSequenceError signal ]
- ].
- ].
+ (wc1 between: 16r400DC00 and: 16r7FFDFFF) ifTrue: [
+ wc1 := ((wc1 bitAnd: 16r3FF0000) bitShift: -6)
+ + (wc1 bitAnd: 16r3FF) + 16r10000.
+ ^Character codePoint: wc1 ].
+ InvalidSequenceError signal ].
+ (wc1 between: 16rD800 and: 16rDBFF) ifFalse: [
+ ^Character codePoint: wc1 ].
+ shift = 0 ifTrue: [
+ InvalidSequenceError signal ].
+
+ "Read a High Surrogate."
+ wch := wch + (wc1 - 16rD400 bitShift: 16).
+ ^self getNext ].
- "Completed an UTF-16 surrogate pair"
-
- "35FDC00 = -0xD800 << 10 - 0xDC00 + 0x10000"
- wc1 := wch bitShift: -16.
- wch := (wch bitAnd: 65535) + (value bitShift: shift - 6).
- wc1 := (wc1 bitShift: 10) + wch - 16r35FDC00.
-
- wch := ((value bitShift: shift) bitAnd: 16r3F) bitShift: 26.
- ^Character codePoint: wc1
+ "Concatenate the base64 integer value to the accumulator"
+ shift := shift - 6.
+ wch := wch + (value bitShift: shift).
+ ^self getNext!
! !
--- orig/packages/iconv/iconvtests.st
+++ mod/packages/iconv/iconvtests.st
@@ -113,4 +113,32 @@ testByteArrayAsUnicodeStringColon
str := #[239 191 190].
self assert: (str asUnicodeString: 'UTF-8') first = $<16rFFFE>.
str := #[208 184].
- self assert: (str asUnicodeString: 'UTF-8') first = $<16r438>! !
+ self assert: (str asUnicodeString: 'UTF-8') first = $<16r438>!
+
+testFromUTF7
+ self assert: ('+-' asUnicodeString: 'UTF-7') first = $+.
+ self assert: ('+BBg-' asUnicodeString: 'UTF-7') first = $<16r418>.
+ self assert: ('+BBgEOA-' asUnicodeString: 'UTF-7') second = $<16r438>.
+ self assert: ('+BBgEOAQZ-' asUnicodeString: 'UTF-7') third = $<16r419>.
+ self assert: ('+2//f/w-' asUnicodeString: 'UTF-7') first = $<16r10FFFF>.
+ self assert: ('+2//f/w-' asUnicodeString: 'UTF-7') size = 1.
+ self assert: ('+BDjb/9//-' asUnicodeString: 'UTF-7') last = $<16r10FFFF>.
+ self assert: ('+BDjb/9//-' asUnicodeString: 'UTF-7') size = 2.
+ self assert: ('+BDgEGNv/3/8-' asUnicodeString: 'UTF-7') last = $<16r10FFFF>.
+ self assert: ('+BDgEGNv/3/8-' asUnicodeString: 'UTF-7') size = 3!
+
+testToUTF7
+ | str |
+ self assert: ((UnicodeString with: $+) asString: 'UTF-7') asString = '+-'.
+ str := UnicodeString with: $<16r418>.
+ self assert: (str asString: 'UTF-7') encoding = 'UTF-7'.
+ self assert: (str asString: 'UTF-7') asString = '+BBg-'.
+ str := str copyWith: $<16r438>.
+ self assert: (str asString: 'UTF-7') asString = '+BBgEOA-'.
+ str := str copyWith: $<16r419>.
+ self assert: (str asString: 'UTF-7') asString = '+BBgEOAQZ-'.
+ str := UnicodeString with: $<16r10FFFF>.
+ self assert: (str asString: 'UTF-7') asString = '+2//f/w-'.
+!
+
+ !
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Help-smalltalk] [PATCH] fix utf7,
Paolo Bonzini <=