BE THE CODER - nu/xom/xinclude/EncodingHeuristics.java


/* Copyright 2002, 2003, 2005 Elliotte Rusty Harold

   

   This library is free software; you can redistribute it and/or modify

   it under the terms of version 2.1 of the GNU Lesser General Public 

   License as published by the Free Software Foundation.

   

   This library is distributed in the hope that it will be useful,

   but WITHOUT ANY WARRANTY; without even the implied warranty of

   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 

   GNU Lesser General Public License for more details.

   

   You should have received a copy of the GNU Lesser General Public

   License along with this library; if not, write to the 

   Free Software Foundation, Inc., 59 Temple Place, Suite 330, 

   Boston, MA 02111-1307  USA

   

   You can contact Elliotte Rusty Harold by sending e-mail to

   [email protected]. Please include the word "XOM" in the

   subject line. The XOM home page is located at http://www.xom.nu/

*/



package nu.xom.xinclude;



import java.io.IOException;

import java.io.InputStream;



/**

 * <p>

 * <code>EncodingHeuristics</code> reads from a stream

 * (which should be buffered) and attempts to guess

 * what the encoding of the text in the stream is.

 * Byte order marks are stripped from the stream.

 * If it fails to determine the type of the encoding,

 * it returns the default UTF-8. 

 * </p>

 *

 *

 * @author Elliotte Rusty Harold

 * @version 1.0

 */

class EncodingHeuristics {

    /** Name returned whenever no more specific encoding can be determined. */
    private static final String UTF_8 = "UTF-8";

    /**
     * Maximum number of bytes inspected. It is also the read limit passed
     * to <code>mark()</code>, so no more than this many bytes may be
     * consumed before the stream is reset.
     */
    private static final int LOOKAHEAD = 1024;

    // No instances allowed
    private EncodingHeuristics() {}


    /**
     * <p>
     * Guesses the encoding of a stream from its first bytes: a byte
     * order mark, the byte pattern of an initial <code>&lt;</code> or
     * <code>&lt;?</code>, or the <code>encoding</code> pseudo-attribute
     * of an XML/text declaration.
     * </p>
     *
     * <p>
     * When a byte order mark is recognized it is consumed and the
     * stream is left positioned just after it. In every other case
     * the stream is reset to where it was when the method was invoked.
     * Any failure while guessing yields the default "UTF-8".
     * </p>
     *
     * @param in   <code>InputStream</code> to read from; it has to
     *     support <code>mark()</code>/<code>reset()</code> for every
     *     path except an initial UTF-16 or UTF-8 byte order mark
     * @return String  The name of the encoding.
     * @throws IOException if the stream cannot be reset back
     *      to where it was when the method was invoked.
     */
    public static String readEncodingFromStream(InputStream in)
      throws IOException {

        // This may fail if there are a lot of space
        // characters before the end of the encoding declaration
        in.mark(LOOKAHEAD);

        try {
            int byte1 = in.read();
            int byte2 = in.read();
            if (byte1 == 0xFE && byte2 == 0xFF) {
                // Don't reset because the byte order mark should not be
                // included per section 4.3 of the XInclude spec
                return "UnicodeBig";
            }
            if (byte1 == 0xFF && byte2 == 0xFE) {
                // Either a UTF-16 or a UTF-32 little-endian mark
                return readLittleEndianByteOrderMark(in);
            }

            int byte3 = in.read();
            if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {
                // UTF-8 byte order mark; not reset, same reason as above
                return UTF_8;
            }

            int byte4 = in.read();
            if (byte1 == 0x00 && byte2 == 0x00) {
                // Byte order marks are not reset (XInclude section 4.3).
                // Not every Java VM supports the UTF-32 encodings.
                if (byte3 == 0xFE && byte4 == 0xFF) {
                    return "UTF32BE";
                }
                if (byte3 == 0xFF && byte4 == 0xFE) {
                    // NOTE(review): XML 1.0 Appendix F lists this
                    // signature as UCS-4 in the unusual 2143 octet
                    // order, not little-endian. The historical return
                    // value is kept so existing callers see no change.
                    return "UTF32LE";
                }
            }

            // No byte order mark present. Look at how a leading
            // less-than sign (and question mark) would be encoded.
            String encoding = UTF_8;
            if (byte1 == 0x00 && byte2 == 0x00
              && byte3 == 0x00 && byte4 == '<') {
                encoding = "UTF32BE";
            }
            else if (byte1 == '<' && byte2 == 0x00
              && byte3 == 0x00 && byte4 == 0x00) {
                encoding = "UTF32LE";
            }
            else if (byte1 == 0x00 && byte2 == '<'
              && byte3 == 0x00 && byte4 == '?') {
                encoding = "UnicodeBigUnmarked";
            }
            else if (byte1 == '<' && byte2 == 0x00
              && byte3 == '?' && byte4 == 0x00) {
                encoding = "UnicodeLittleUnmarked";
            }
            else if (byte1 == '<' && byte2 == '?'
              && byte3 == 'x' && byte4 == 'm') {
                // ASCII compatible, must read encoding declaration.
                // Latin-1 (ISO-8859-1) is ASCII compatible and every
                // byte sequence is legal in it, so decoding cannot
                // fail even past the end of the declaration.
                encoding = findEncodingDeclaration(
                  readDeclaration(in, byte1, byte2, byte3, byte4, "8859_1"));
            }
            else if (byte1 == 0x4C && byte2 == 0x6F
              && byte3 == 0xA7 && byte4 == 0x94) {
                // EBCDIC compatible, must read encoding declaration.
                // Most EBCDIC encodings are compatible with Cp037 over
                // the range we care about. If this VM lacks Cp037 the
                // resulting exception leads to the UTF-8 default below.
                encoding = findEncodingDeclaration(
                  readDeclaration(in, byte1, byte2, byte3, byte4, "Cp037"));
            }

            in.reset();
            return encoding;
        }
        catch (Exception ex) {
            // Deliberately broad: guessing is best effort, so any failure
            // (I/O error, unsupported charset, misbehaving stream) falls
            // back to the default. A failing reset still propagates.
            in.reset();
            return UTF_8;
        }

    }


    /**
     * Decides between UTF-16 and UTF-32 after the bytes FF FE have been
     * consumed. Per XML 1.0 Appendix F, FF FE 00 00 is the UCS-4
     * little-endian mark, while FF FE followed by anything else is
     * UTF-16 little-endian. On return the stream is positioned just
     * after whichever mark was recognized.
     */
    private static String readLittleEndianByteOrderMark(InputStream in)
      throws IOException {

        if (!in.markSupported()) {
            // Looking ahead could not be undone; keep the two-byte answer
            return "UnicodeLittle";
        }
        int byte3 = in.read();
        int byte4 = in.read();
        if (byte3 == 0x00 && byte4 == 0x00) {
            return "UTF32LE";
        }
        // The two extra bytes are content: go back to the mark and
        // consume only the two-byte byte order mark again
        in.reset();
        in.read();
        in.read();
        return "UnicodeLittle";

    }


    /**
     * Reads the start of the stream, up to LOOKAHEAD bytes in total
     * including the four bytes already consumed, and decodes it with
     * the named charset. Reading continues until the buffer is full or
     * the stream ends, so a stream that hands out data in small pieces
     * does not truncate the declaration.
     */
    private static String readDeclaration(InputStream in, int byte1,
      int byte2, int byte3, int byte4, String charsetName)
      throws IOException {

        byte[] data = new byte[LOOKAHEAD];
        data[0] = (byte) byte1;
        data[1] = (byte) byte2;
        data[2] = (byte) byte3;
        data[3] = (byte) byte4;
        int length = 4;
        while (length < data.length) {
            int count = in.read(data, length, data.length - length);
            // -1 is end of stream; 0 is guarded to rule out spinning
            if (count <= 0) break;
            length += count;
        }
        // Decode only what was read, never the unused tail of the buffer
        return new String(data, 0, length, charsetName);

    }


    /**
     * Extracts the value of the <code>encoding</code> pseudo-attribute
     * from the decoded start of a document. Only text before the first
     * <code>?&gt;</code> is considered, so content following the
     * declaration is never mistaken for it. If no terminator is present
     * the whole string is searched. A missing or malformed encoding
     * declaration yields "UTF-8".
     */
    private static String findEncodingDeclaration(String declaration) {

        int end = declaration.indexOf("?>");
        if (end == -1) end = declaration.length();

        int position = declaration.indexOf("encoding");
        if (position == -1 || position >= end) {
            return UTF_8; // no encoding declared
        }

        // get rid of white space before equals sign
        position = skipWhiteSpace(
          declaration, position + "encoding".length(), end);
        if (position >= end || declaration.charAt(position) != '=') {
            return UTF_8; // malformed
        }

        // get rid of white space after equals sign
        position = skipWhiteSpace(declaration, position + 1, end);
        if (position >= end) {
            return UTF_8; // malformed
        }
        char delimiter = declaration.charAt(position);
        if (delimiter != '\'' && delimiter != '"') {
            return UTF_8; // malformed
        }

        // now positioned to read encoding name
        int nameStart = position + 1;
        int nameEnd = declaration.indexOf(delimiter, nameStart);
        if (nameEnd == -1 || nameEnd >= end) {
            return UTF_8; // closing quote missing
        }
        return declaration.substring(nameStart, nameEnd);

    }


    /**
     * Returns the index of the first character at or after
     * <code>position</code> that is not a space, tab, carriage return
     * or line feed, or <code>limit</code> if there is none before it.
     */
    private static int skipWhiteSpace(String text, int position, int limit) {

        while (position < limit) {
            char c = text.charAt(position);
            if (c != ' ' && c != '\t' && c != '\r' && c != '\n') break;
            position++;
        }
        return position;

    }

}
Open Source Repository
Home	/xom/xom-1.1 \| Repository Home
Open Source Repository