--- classpath-backup/java/nio/charset/CharsetDecoder.java 2003-09-20 11:40:54.000000000 +0100 +++ classpath/java/nio/charset/CharsetDecoder.java 2003-09-21 23:24:47.000000000 +0100 @@ -41,30 +41,198 @@ import java.nio.CharBuffer; /** + * CharsetDecoder provides facilities to decode a sequence of + * bytes (one or more address@hidden ByteBuffer}s) that are in a particular + * charset into a sequence of standard Java 16-bit Unicode characters (one or + * more address@hidden CharBuffer}s). + * + *

The input data are provided in a address@hidden ByteBuffer} or, by invoking + * address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)} + * repeatedly and passing a different address@hidden ByteBuffer} as a parameter each + * time, the data can be provided in a sequence of address@hidden ByteBuffer}s. + * + *

The output data are written to a address@hidden CharBuffer} or, by invoking + * address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)} + * repeatedly and altering the address@hidden CharBuffer} passed as a parameter each + * time, the data can be written to a sequence of address@hidden CharBuffer}s. + * + *

Methods are normally invoked on a CharsetDecoder in a certain order: + * + *

1. address@hidden #reset()} - invoke if the CharsetDecoder has + * been used before. + * + *

2. address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)} + * - invoke repeatedly as long as there are address@hidden ByteBuffer}s to be decoded. + * + *

endOfInput should be false until the + * current invocation of + * address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)} is the + * last, i.e., pass true only when there are no more + * data to decode after the current invocation. + * + *

Each invocation of + * address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)} decodes + * as many bytes as possible, and returns a address@hidden CoderResult} object to + * describe the reason for returning. The invoker can examine this object to + * determine the reason and act accordingly, e.g., by filling the input + * address@hidden ByteBuffer}. + * + *

3. address@hidden #flush(CharBuffer out)} to ensure that the + * CharsetDecoder writes all data to the output + * address@hidden CharBuffer}. + * + *

If the input address@hidden ByteBuffer} does not contain a valid sequence of + * bytes for the specified address@hidden Charset} the input is malformed + * (address@hidden CoderResult#isMalformed()}). + * + *

If the input address@hidden ByteBuffer} is valid, but there is no equivalent + * Unicode character to map part of it to, the input has an unmappable + * character (address@hidden CoderResult#isUnmappable()}). + * + *

The methods address@hidden #onMalformedInput(CodingErrorAction newAction)} and + * address@hidden #onUnmappableCharacter(CodingErrorAction newAction)} allow an + * action (address@hidden CodingErrorAction}) to be specified for the respective + * decoding errors. + * + *

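The actions can be one of {@link CodingErrorAction#IGNORE}, + * {@link CodingErrorAction#REPLACE} or {@link CodingErrorAction#REPORT}. + *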

+ * + *

To implement a decoder for a specific charset, create a subclass of + * this class and implement the protected abstract method + * address@hidden #decodeLoop(ByteBuffer in, CharBuffer out)}. + * + *

To ensure that any internal state in a subclass is cleared correctly, + * also override address@hidden #flush(CharBuffer out)} and address@hidden #reset()}, but make + * sure that you invoke super.flush(out) and + * super.reset() in their respective methods. + * + * Instances of this class are NOT threadsafe. + * * @author Jesse Rosenstock + * @author Ricky Clarkson * @since 1.4 */ public abstract class CharsetDecoder { + /** + * Flag meaning that the CharsetDecoder has been reset. + * + *

See address@hidden #state}. + */ private static final int STATE_RESET = 0; + + /** + * Flag meaning that the CharsetDecoder is currently decoding + * data. + * + *

See address@hidden #state}. + */ private static final int STATE_CODING = 1; + + /** + * Flag meaning that the CharsetDecoder has finished decoding + * data. + * + *

See address@hidden #state}. + */ private static final int STATE_END = 2; + + /** + * Flag meaning that the CharsetDecoder has been flushed. + * + *

See {@link #state}. + */ private static final int STATE_FLUSHED = 3; + /** + * The default value for {@link #replacement} before {@link #replaceWith} + * is invoked. + */ private static final String DEFAULT_REPLACEMENT = "\uFFFD"; + /** + * The {@link Charset} that instantiated this CharsetDecoder. + */ private final Charset charset; + + /** + * The expected average number of characters output per byte input. + * + *

For example, if 3 bytes correspond to 1 character, this value will be + * equal to 1.0f/3. + */ private final float averageCharsPerByte; + + /** + * The maximum number of characters output per byte. + * + *

For example, if between 2 and 5 bytes correspond to 1 character, this + * value will be 0.5f. + */ private final float maxCharsPerByte; + + /** + * The {@link String} that will be inserted if a sequence of bytes is + * unmappable (see {@link CoderResult#isUnmappable()}). + * Set during construction and also by {@link #replaceWith(String)}. + */ + //Is there any reason that this cannot be set to DEFAULT_REPLACEMENT + //here, and maybe DEFAULT_REPLACEMENT removed? + //Maybe removing DEFAULT_REPLACEMENT would cause Serialization problems. private String replacement; + /** + * The current state of the decoder. One of {@link #STATE_RESET}, + * {@link #STATE_CODING}, {@link #STATE_END} or {@link #STATE_FLUSHED}. + *

+ * At initialization time it is set to {@link #STATE_RESET}. + */ private int state = STATE_RESET; + /** + * The action to be taken when malformed input is received. + * At initialization time it is set to + * {@link CodingErrorAction#REPORT}, i.e., so that the user of the class is + * informed about any malformed input. + */ private CodingErrorAction malformedInputAction = CodingErrorAction.REPORT; + + /** + * The action to be taken when a character that cannot be mapped to Unicode + * is received. + * At initialization time it is set to + * {@link CodingErrorAction#REPORT}, i.e., so that the user of the class is + * informed about any unmapped characters. + */ private CodingErrorAction unmappableCharacterAction = CodingErrorAction.REPORT; + /** + * Real implementation of constructor (the other constructor just invokes + * this one). + * + * The main docs are in the other constructor though, because it will appear + * in most Javadocs. + */ private CharsetDecoder (Charset cs, float averageCharsPerByte, float maxCharsPerByte, String replacement) { @@ -82,22 +250,108 @@ implReplaceWith (replacement); } + /** + * Constructs a CharsetDecoder for the specified + * {@link Charset}. + * + *

This is normally only invoked via address@hidden Charset#newDecoder()}. + * + *

The method address@hidden #averageCharsPerByte()} will return the value + * passed as the parameter averageCharsPerByte. + * + *

The method address@hidden #maxCharsPerByte()} will return the value + * passed as the parameter maxCharsPerByte. + * + *

The replacement address@hidden String} will be set to + * "\uFFFD". + * See address@hidden #replacement()}. + * + * @param cs the address@hidden Charset} that instantiated this + * CharsetDecoder. + * + * @param averageCharsPerByte a positive value describing the expected + * average number of characters of output per byte of input. + * + * @param maxCharsPerByte a positive value describing the maximum + * number of characters of output per byte of input. + * + * @throws IllegalArgumentException if averageCharsPerByte + * or maxCharsPerByte are less than or equal to 0. + */ protected CharsetDecoder (Charset cs, float averageCharsPerByte, float maxCharsPerByte) { this (cs, averageCharsPerByte, maxCharsPerByte, DEFAULT_REPLACEMENT); } + /** + * Returns the average number of characters output per byte of input. + * + *

For example, if 3 bytes correspond to 1 character, this value will be + * equal to 1.0f/3. + * + * @return the average number of characters output per byte of input. + */ public final float averageCharsPerByte () { return averageCharsPerByte; } + /** + * Returns the address@hidden Charset} that instantiated this + * CharsetDecoder. + * + * @return the address@hidden Charset} that instantiated this + * CharsetDecoder. + */ public final Charset charset () { return charset; } + /** + * Decodes the input address@hidden ByteBuffer}, allocates a address@hidden CharBuffer} + * for output and places the decoded characters in the output + * address@hidden CharBuffer}. + * + *

This method probably should reset the CharsetDecoder, but + * instead throws an address@hidden IllegalStateException} if the + * CharsetDecoder has not already been reset. Sun's + * documentation conflicts with itself here, so it would be useful to run + * a test to find out what Sun's actual behavior is. + * + *

It then allocates a address@hidden CharBuffer} and populates it with decoded + * data, and invokes address@hidden #flush(CharBuffer out)}. + * + *

This method should not be invoked between any invocations of + * address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)} and + * address@hidden #reset()}, and after use address@hidden #reset()} should be invoked on it + * before invoking + * address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)}. + * + *

The current algorithm for this uses the value that + * address@hidden #maxCharsPerByte()} returns to allocate storage, so it is not + * optimal in memory usage. + * + * @param in the input address@hidden ByteBuffer}. + * @return the new and populated address@hidden CharBuffer}, which is at position 0 + * and is limited to the size of the data. + * + * @throws IllegalStateException if the CharsetDecoder has not + * been address@hidden #reset()}. + * + * @throws MalformedInputException if the input is not valid for the + * address@hidden Charset} and the malformed input action is + * address@hidden CodingErrorAction#REPORT}. + * + * @throws UnmappableCharacterException if the input contains a character + * for which there is no known mapping to Unicode and the unmappable + * character action is set to address@hidden CodingErrorAction#REPORT}. + * + * @throws CharacterCodingException because + * address@hidden CoderResult#throwException()} is declared to throw this kind of + * exception. + */ public final CharBuffer decode (ByteBuffer in) throws CharacterCodingException { @@ -133,6 +387,52 @@ return out; } + /** + * Decodes the data passed in the input address@hidden ByteBuffer} + * and places the decoded characters in the output address@hidden CharBuffer}. + * + *

Make sure that endOfInput is true if and only if this + * invocation is the last invocation in this sequence of calls to + * address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)}. + * + *

Both buffers are used starting at their current positions, i.e., + * rewind() is NOT invoked on either. + * + *

This method will modify the current position, but will not affect the + * marks and limits. + * + *

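Returns one of the following: {@link CoderResult#UNDERFLOW} when the + * input has been consumed, {@link CoderResult#OVERFLOW} when the output + * {@link CharBuffer} is full, or, if the corresponding action is + * {@link CodingErrorAction#REPORT}, a result describing malformed input + * ({@link CoderResult#isMalformed()}) or an unmappable character + * ({@link CoderResult#isUnmappable()}). + *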

+ * + * @param in a buffer holding the sequence of bytes to decode. + * @param out a buffer to hold the sequence of decoded characters. + * + * @param endOfInput a flag to describe whether the contents of + * in are the last data to be passed to this method. + * + * @return a {@link CoderResult} describing why the method returns. + * + * @throws IllegalStateException if the most recent method invocation + * (other than accessors and mutators) was not one of + * {@link #reset()} or + * {@link #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)} where + * endOfInput is false. + * + * @throws CoderMalfunctionError if {@link #decodeLoop} throws an unchecked + * exception. + */ public final CoderResult decode (ByteBuffer in, CharBuffer out, boolean endOfInput) { @@ -188,13 +488,72 @@ } } + /** + * Decodes the bytes in the input {@link ByteBuffer} and puts the + * resulting characters in the output {@link CharBuffer}. + * + *

The buffers are not rewound (rewind() is not invoked on + * either), and the marks and limits are not set. The positions will be + * modified. + * + *

Returns the same values as + * address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)}. + * + * @param in a buffer holding the data to be decoded. + * @param out a buffer to hold the decoded data (characters). + * @return a address@hidden CoderResult} instance describing the reason for + * returning. + */ protected abstract CoderResult decodeLoop (ByteBuffer in, CharBuffer out); + /** + * If a subclass is capable of detecting the address@hidden Charset} based on input + * data, it should override this. + * + * @return the detected address@hidden Charset}. + * + * @throws UnsupportedOperationException if detecting the address@hidden Charset} + * based on input data is not supported (this is the default). + * + * @throws IllegalStateException if not enough data have been processed yet + * to detect the address@hidden Charset}. + */ public Charset detectedCharset () { throw new UnsupportedOperationException (); } - + + /** + * Empties all buffers. + * + *

Some subclasses may need to be notified when decoding has finished, + * so that they can do some cleanup, such as releasing resources, or + * appending some characters to the end of the output. This method does + * some housekeeping of its own, then invokes implFlush, which a subclass may + * override to do this cleanup. + * + *

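The method returns one of two values: {@link CoderResult#UNDERFLOW} if + * all remaining output was written, or {@link CoderResult#OVERFLOW} if the + * output {@link CharBuffer} does not have enough room. + *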

+ * + *

Note that this method can be invoked directly after address@hidden #reset} + * without + * address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)} + * being invoked, so a subclass should probably check to see whether decode + * has been invoked, to decide what output to write. + * + * @param out the address@hidden CharBuffer} to write to. + * @return a address@hidden CoderResult} instance describing the success or failure + * state of the method. + */ public final CoderResult flush (CharBuffer out) { // It seems weird that you can flush after reset, but Sun's javadoc @@ -214,11 +573,34 @@ return implFlush (out); } + /** + * Does any cleanup that subclasses need to do. + * + *

This method is intended to be subclassed to do any cleanup. + * + * See address@hidden #flush(CharBuffer out)}. + * + * @param out the address@hidden CharBuffer} to write to. + * @return a address@hidden CoderResult} instance describing whether the method + * succeeded or failed. + */ protected CoderResult implFlush (CharBuffer out) { return CoderResult.UNDERFLOW; } + /** + * Sets the action to be taken when invalid input is received. + * + *

For details of the available actions, see address@hidden CodingErrorAction}. + * + * @param newAction the action to be taken when invalid input is received. + * + * @return the CharsetDecoder that this method was invoked on. + * + * @throws IllegalArgumentException if newAction is + * null. + */ public final CharsetDecoder onMalformedInput (CodingErrorAction newAction) { if (newAction == null) @@ -229,46 +611,140 @@ return this; } + /** + * Notifies a subclass when the invoker has changed the action to be taken + * when malformed input is received. + * + *

The default implementation does nothing; it is just intended to be + * overridden if required. + * + *

This method is invoked by address@hidden #onMalformedInput}. + * + * @param newAction the action to be taken when malformed input is received. + */ protected void implOnMalformedInput (CodingErrorAction newAction) { // default implementation does nothing } + /** + * Notifies a subclass when the invoker has changed the action to be taken + * when a sequence of bytes cannot be mapped to a Unicode character. + * + *

The default implementation does nothing; it is just intended to be + * overridden if required. + * + *

This method is invoked by address@hidden #onUnmappableCharacter}. + * + * @param newAction the action to be taken when a sequence of bytes is + * received that cannot be mapped to a Unicode character. + */ protected void implOnUnmappableCharacter (CodingErrorAction newAction) { // default implementation does nothing } + /** + * Notifies a subclass when the invoker has changed the replacement + * address@hidden String} (the sequence of characters to be output when a + * character is encountered that cannot be mapped to Unicode in the current + * address@hidden Charset}). + * + *

The default implementation does nothing; it is just intended to be + * overridden if required. + * + *

This method is invoked by address@hidden #replaceWith}. + * + * @param newReplacement the new replacement String. + */ protected void implReplaceWith (String newReplacement) { // default implementation does nothing } + /** + * Notifies a subclass when the invoker has invoked address@hidden #reset()}. + * + *

The default implementation does nothing; it is just intended to be + * overridden if required. + * + *

This method is invoked by address@hidden #reset()}. + */ protected void implReset () { // default implementation does nothing } + /** + * Determines whether this CharsetDecoder can work out how to + * decode data based on the actual data, i.e., whether it can auto-detect + * the charset. + * + *

The default implementation always returns false; subclasses should + * override it if they implement auto-detection. + * + * @return true if this CharsetDecoder is + * auto-detecting, false otherwise. + */ public boolean isAutoDetecting () { return false; } + /** + * Determines whether or not an auto-detecting CharsetDecoder + * has yet detected the {@link Charset} of the data. + * + * @return true if the auto-detecting + * CharsetDecoder has detected the {@link Charset} of the + * data, false otherwise. Note that a false value + * does not indicate that bytes have not been decoded, just that the + * CharsetDecoder has not detected the {@link Charset} of + * the data. + * + * @throws UnsupportedOperationException if the CharsetDecoder + * does not support auto-detection (this is the default). + */ public boolean isCharsetDetected () { throw new UnsupportedOperationException (); } + /** + * Returns the action to be taken when invalid data is received. + * + * @return the action to be taken when invalid data is received. + */ public CodingErrorAction malformedInputAction () { return malformedInputAction; } + /** + * Returns the maximum number of characters output per byte of input. + * + *

For example, if a character of output needs between 2 and 5 bytes, + * then this value will be 0.5f. + * + * @return the maximum number of characters output per byte of input. + */ public final float maxCharsPerByte () { return maxCharsPerByte; } + /** + * Changes the action to be taken when a character is received that does not + * map into Unicode in this address@hidden Charset}. + * + * @param newAction the action to be taken when a character is received that + * does not map into Unicode in this address@hidden Charset}. + * + * @return the CharsetDecoder that this method was invoked on, + * for convenience. + * + * @throws IllegalArgumentException if newAction is null. + */ public final CharsetDecoder onUnmappableCharacter (CodingErrorAction newAction) { @@ -280,11 +756,33 @@ return this; } + /** + * Returns the sequence of characters used to replace a character that + * cannot be mapped to Unicode. + * + *

The address@hidden String} returned always has some content, i.e., it is + * never null and it is never empty. + * + * @return the sequence of characters used to replace a character that + * cannot be mapped to Unicode. + */ public final String replacement () { return replacement; } + /** + * Changes the sequence of characters used to replace a character that + * cannot be mapped to Unicode. + * + * @param newReplacement the new sequence of characters to be used to + * replace characters that cannot be mapped to Unicode. + * + * @return the CharsetDecoder that this method was invoked on. + * + * @throws IllegalArgumentException if newReplacement is + * null or empty (a length of 0). + */ public final CharsetDecoder replaceWith (String newReplacement) { if (newReplacement == null) @@ -298,6 +796,15 @@ return this; } + /** + * Resets this CharsetDecoder, ready for + * address@hidden #decode(ByteBuffer in, CharBuffer out, boolean endOfInput)}. + * + *

Invokes address@hidden #implReset} to ensure that any reset actions in the + * subclass are performed. + * + * @return the CharsetDecoder that this method was invoked on. + */ public final CharsetDecoder reset () { state = STATE_RESET; @@ -305,6 +812,13 @@ return this; } + /** + * Returns the action to be taken when a character is received that cannot + * be represented in Unicode. + * + * @return the action to be taken when a character is received that cannot + * be represented in Unicode. + */ public CodingErrorAction unmappableCharacterAction () { return unmappableCharacterAction;