// BE THE CODER - org/apache/velocity/io/UnicodeInputStream.java


package org.apache.velocity.io;



/*

 * Licensed to the Apache Software Foundation (ASF) under one

 * or more contributor license agreements.  See the NOTICE file

 * distributed with this work for additional information

 * regarding copyright ownership.  The ASF licenses this file

 * to you under the Apache License, Version 2.0 (the

 * "License"); you may not use this file except in compliance

 * with the License.  You may obtain a copy of the License at

 *

 *   http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing,

 * software distributed under the License is distributed on an

 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

 * KIND, either express or implied.  See the License for the

 * specific language governing permissions and limitations

 * under the License.

 */





import java.io.IOException;

import java.io.InputStream;

import java.io.PushbackInputStream;



import org.apache.velocity.util.ExceptionUtils;





/**
 * An input stream that is aware of Unicode byte order marks (BOMs). This allows
 * you to e.g. read Windows Notepad Unicode files as Velocity templates.
 *
 * When the stream is constructed, its first bytes are inspected for a UTF-8,
 * UTF-16 (LE/BE) or UTF-32 (LE/BE) BOM. The encoding announced by the BOM, if
 * any, is available through {@link #getEncodingFromStream()}. Depending on the
 * <code>skipBOM</code> constructor flag, the BOM bytes are either dropped or
 * left in the stream; all bytes that are not part of a skipped BOM are pushed
 * back and returned by subsequent reads.
 *
 * This class is not thread safe! When more than one thread wants to use an
 * instance of UnicodeInputStream, the caller must provide synchronization.
 *
 * @author Aki Nieminen
 * @author Henning P. Schmiedehausen
 * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
 * @since 1.5
 */
public class UnicodeInputStream
    extends InputStream
{

    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte [] { (byte)0xef, (byte)0xbb, (byte)0xbf });

    /** BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte [] { (byte)0xff, (byte)0xfe });

    /** BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte [] { (byte)0xfe, (byte)0xff });

    /**
     * BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this?
     */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte [] { (byte)0xff, (byte)0xfe, (byte)0x00, (byte)0x00 });

    /**
     * BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this?
     */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte [] { (byte)0x00, (byte)0x00, (byte)0xfe, (byte)0xff });

    /** The maximum amount of bytes to read for a BOM */
    private static final int MAX_BOM_SIZE = 4;

    /** Buffer holding the bytes read while probing for a BOM. */
    private final byte [] buf = new byte[MAX_BOM_SIZE];

    /** Number of valid bytes in {@link #buf}; also the index of the next free slot. */
    private int pos = 0;

    /** The stream encoding as read from the BOM or null. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not read. */
    private final boolean skipBOM;

    /** Wrapped stream; sized so that every byte read while probing can be pushed back. */
    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
     *
     * @param  inputStream The input stream to use for reading.
     *
     * @throws IllegalStateException When the start of the stream could not be read while probing for a BOM.
     * @throws IOException Declared for source compatibility; read failures during BOM probing are
     *         reported as {@link IllegalStateException}.
     */
    public UnicodeInputStream(final InputStream inputStream)
            throws IllegalStateException, IOException
    {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object.
     *
     * Note that this constructor calls the overridable {@link #readEncoding()}; a subclass overriding
     * that method is invoked before its own constructor body has run.
     *
     * @param  inputStream The input stream to use for reading.
     * @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true.
     *
     * @throws IllegalStateException When the start of the stream could not be read while probing for a BOM.
     *         The underlying {@link IOException} is available as the cause.
     * @throws IOException Declared for source compatibility; read failures during BOM probing are
     *         reported as {@link IllegalStateException}.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
            throws IllegalStateException, IOException
    {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        try
        {
            this.encoding = readEncoding();
        }
        catch (IOException ioe)
        {
            // Chain the original failure so callers can still see why the probe failed.
            final IllegalStateException ex = new IllegalStateException("Could not read BOM from Stream");
            ex.initCause(ioe);
            throw ex;
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return  True if the input stream discards the BOM.
     */
    public boolean isSkipBOM()
    {
        return skipBOM;
    }

    /**
     * Returns the encoding that was detected from the BOM when this stream was constructed.
     *
     * @return  The encoding name announced by the BOM, or null if the stream did not start with a known BOM.
     */
    public String getEncodingFromStream()
    {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding
     * is undefined.
     *
     * Reads up to {@link #MAX_BOM_SIZE} bytes from the underlying stream and pushes back every byte
     * that is not part of a skipped BOM, so that subsequent reads see them again.
     *
     * @return The encoding of this streams contents as decided by the BOM or null if no BOM was found.
     *
     * @throws IOException When the underlying stream could not be read.
     */
    protected String readEncoding()
        throws IOException
    {
        pos = 0;

        UnicodeBOM bom = null;

        // The first byte decides which BOM candidates are possible at all:
        //
        // 00 00 FE FF --> UTF 32 BE
        // EF BB BF    --> UTF 8
        // FE FF       --> UTF 16 BE
        // FF FE       --> UTF 16 LE
        // FF FE 00 00 --> UTF 32 LE
        if (readByte())
        {
            switch (buf[0])
            {
            case (byte)0x00: // UTF32 BE
                bom = match(UTF32BE_BOM, null);
                break;
            case (byte)0xef: // UTF8
                bom = match(UTF8_BOM, null);
                break;
            case (byte)0xfe: // UTF16 BE
                bom = match(UTF16BE_BOM, null);
                break;
            case (byte)0xff: // UTF16/32 LE
                bom = match(UTF16LE_BOM, null);

                if (bom != null)
                {
                    // The UTF-32 LE BOM starts with the UTF-16 LE BOM; try the
                    // longer one and fall back to UTF-16 LE if it does not match.
                    bom = match(UTF32LE_BOM, bom);
                }
                break;

            default:
                bom = null;
                break;
            }
        }

        pushback(bom);

        return (bom != null) ? bom.getEncoding() : null;
    }

    /**
     * Compares the start of the stream with the given BOM, reading further bytes into the buffer as needed.
     *
     * @param matchEncoding The BOM to test for.
     * @param noMatchEncoding The value to return if the stream does not start with the tested BOM.
     *
     * @return matchEncoding if all its bytes are found at the start of the stream, noMatchEncoding if a byte
     *         differs or the stream ends before the BOM is complete.
     *
     * @throws IOException When the underlying stream could not be read.
     */
    private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
        throws IOException
    {
        final byte [] bom = matchEncoding.getBytes();

        for (int i = 0; i < bom.length; i++)
        {
            // Bytes below pos were already buffered by an earlier probe; only read what is missing.
            if (pos <= i && !readByte())
            {
                return noMatchEncoding;
            }

            if (bom[i] != buf[i])
            {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    /**
     * Reads one byte from the underlying stream and appends it to the BOM buffer.
     *
     * @return true if a byte was buffered, false if the end of the stream was reached.
     *
     * @throws IOException When the BOM buffer is already full or the underlying stream could not be read.
     */
    private boolean readByte()
            throws IOException
    {
        // Check capacity before touching the stream, so that a byte which cannot
        // be buffered (and therefore could not be pushed back) is never consumed.
        if (pos >= buf.length)
        {
            throw new IOException("BOM read error");
        }

        final int res = inputStream.read();
        if (res == -1)
        {
            return false;
        }

        buf[pos++] = (byte) res;
        return true;
    }

    /**
     * Returns the buffered bytes to the underlying stream. If a BOM was matched and BOMs are skipped,
     * only the bytes following the BOM are pushed back; otherwise all buffered bytes are.
     *
     * @param matchBOM The BOM that was detected, or null if none was found.
     *
     * @throws IOException When the bytes could not be pushed back.
     */
    private void pushback(final UnicodeBOM matchBOM)
        throws IOException
    {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM)
        {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.getBytes().length;
            count = (pos - start);

            if (count < 0)
            {
                throw new IllegalStateException("Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * Closes the underlying stream.
     *
     * @see java.io.InputStream#close()
     */
    public void close()
        throws IOException
    {
        inputStream.close();
    }

    /**
     * Delegates to the underlying pushback stream.
     *
     * @see java.io.InputStream#available()
     */
    public int available()
        throws IOException
    {
        return inputStream.available();
    }

    /**
     * Delegates to the underlying {@link PushbackInputStream}, which according to the JDK
     * documentation does not support marking.
     *
     * @see java.io.InputStream#mark(int)
     */
    public void mark(final int readlimit)
    {
        inputStream.mark(readlimit);
    }

    /**
     * Delegates to the underlying {@link PushbackInputStream}.
     *
     * @see java.io.InputStream#markSupported()
     */
    public boolean markSupported()
    {
        return inputStream.markSupported();
    }

    /**
     * Reads the next byte, starting with any bytes pushed back after BOM detection.
     *
     * @see java.io.InputStream#read()
     */
    public int read()
        throws IOException
    {
        return inputStream.read();
    }

    /**
     * Delegates to the underlying pushback stream.
     *
     * @see java.io.InputStream#read(byte[])
     */
    public int read(final byte [] b)
        throws IOException
    {
        return inputStream.read(b);
    }

    /**
     * Delegates to the underlying pushback stream.
     *
     * @see java.io.InputStream#read(byte[], int, int)
     */
    public int read(final byte [] b, final int off, final int len)
        throws IOException
    {
        return inputStream.read(b, off, len);
    }

    /**
     * Delegates to the underlying {@link PushbackInputStream}.
     *
     * @see java.io.InputStream#reset()
     */
    public void reset()
        throws IOException
    {
        inputStream.reset();
    }

    /**
     * Delegates to the underlying pushback stream.
     *
     * @see java.io.InputStream#skip(long)
     */
    public long skip(final long n)
        throws IOException
    {
        return inputStream.skip(n);
    }

    /**
     * Helper class to bundle encoding and BOM marker.
     *
     * @author Henning P. Schmiedehausen
     * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
     */
    static final class UnicodeBOM
    {
        /** Name of the encoding announced by this BOM. */
        private final String encoding;

        /** The byte sequence of this BOM. */
        private final byte [] bytes;

        private UnicodeBOM(final String encoding, final byte [] bytes)
        {
            this.encoding = encoding;
            this.bytes = bytes;
        }

        /**
         * @return The name of the encoding announced by this BOM.
         */
        String getEncoding()
        {
            return encoding;
        }

        /**
         * @return The BOM byte sequence. This is the internal array, not a copy; callers must not modify it.
         */
        byte [] getBytes()
        {
            return bytes;
        }
    }
}
// Open Source Repository
// Home /velocity/velocity-1.6.4 | Repository Home
// Open Source Repository