// Source code of org.vfny.geoserver.util.requests.XmlCharsetDetector

package org.vfny.geoserver.util.requests;


import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.io.IOException;
import java.io.UnsupportedEncodingException;


import java.nio.charset.UnsupportedCharsetException;


import java.util.Locale;


import java.util.logging.Logger;
import java.util.logging.Level;


import java.util.regex.Pattern;
import java.util.regex.Matcher;


import org.vfny.geoserver.util.requests.RewindableInputStream;
import org.vfny.geoserver.util.requests.EncodingInfo;
import org.vfny.geoserver.util.requests.readers.UCSReader;


/**
 * Provides a methods that can be used to detect charset of some
 * XML document and (optionally) return a reader that is aware of
 * this charset and can correctly decode document's data.
 */
public class XmlCharsetDetector {


    protected static Logger LOGGER = Logger.getLogger(
            "org.vfny.geoserver.requests");


    /**
     * In current context naming this "GT", "GREATER_THAN" or like
     * would be misleading.
     */
    private static final char RIGHT_ANGLE_BRACKET = '\u003E';


    private static final Pattern ENCODING_PATTERN =
            Pattern.compile("encoding\\s*\\=\\s*\"([^\"]+)\"");


    /**
     * Maximum number of characters we are expecting in XML Declaration.
     * There are probably will be less then 100, but just in case...
     */
    private static final int MAX_XMLDECL_SIZE = 100;




    /**
     * Based on Xerces-J code, this method will try its best to return a
     * reader which is able to decode content of incoming XML document
     * properly. To achieve this goal, it first infers general
     * encoding scheme of the above document and then uses this
     * information to extract actual charset from XML declaration. In
     * any recoverable error situation default UTF-8 reader will be
     * created.
     *
     * @param istream Byte stream (most probably obtained with
     *                <code>HttpServletRequest.getInputStream</code>
     *                that gives access to XML document in question).
     *
     * @param encInfo Instance of EncodingInfo where information about
     *                detected charset will be stored. You can then
     *                use it, for example, to form a response encoded
     *                with this charset.
     *
     * @throws IOException in case of any unrecoverable I/O errors.
     * @throws UnsupportedCharsetException <code>InputStreamReader</code>'s
     *             constructor will probably throw this exception if
     *             inferred charset of XML document is not supported by
     *             current JVM.
     */
    public static Reader getCharsetAwareReader(InputStream istream,
                                               EncodingInfo encInfo)
            throws
                IOException,
                UnsupportedCharsetException {


        RewindableInputStream stream;
        stream = new RewindableInputStream(istream, false);


       //
       // Phase 1. Reading first four bytes and determining encoding scheme.
        final byte[] b4 = new byte[4];


        int count = 0;
        for (; count < 4; count++) {
            int b = stream.read();
            if (-1 != b) {
                b4[count] = (byte) b;
            } else {
                break;
            }
        }


        if (LOGGER.isLoggable(Level.FINE)) {
           // Such number of concatenating strings makes me sick.
           // But using StringBuffer will make this uglier, not?
            LOGGER.fine("First 4 bytes of XML doc are : "
                + Integer.toHexString((int) b4[0] & 0xff).toUpperCase()
                 + " ('" + (char) b4[0] + "') "
                + Integer.toHexString((int) b4[1] & 0xff).toUpperCase()
                 + " ('" + (char) b4[1] + "') "
                + Integer.toHexString((int) b4[2] & 0xff).toUpperCase()
                 + " ('" + (char) b4[2] + "') "
                + Integer.toHexString((int) b4[3] & 0xff).toUpperCase()
                 + " ('" + (char) b4[3] + "')"
            );
        }


        /*
         * `getEncodingName()` is capable of detecting following encoding
         * schemes:
         * "UTF-8", "UTF-16LE", "UTF-16BE", "ISO-10646-UCS-4",
         * or "CP037". It cannot distinguish between UTF-16 (without BOM)
         * and "ISO-10646-UCS-2", so latter will be interpreted as UTF-16
         * for the purpose of reading XML declaration. There shouldn't be
         * much trouble though as (I believe) these formats are identical for
         * the Basic Multilingual Plane, except that UTF-16-encoded text
         * can contain values from surrogate range and valid UCS-2 input
         * cannot (imho).
         * This ugly form of copying charset data is required to maintain
         * "reference integrity" of encInfo variable. As it can be possibly
         * used after this method call, it should point to the same memory
         * structure, and assignment or cloning doesn't work for me there.
         */
        encInfo.copyFrom(getEncodingName(b4, count));


        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Charset detection phase 1. Inferred encoding: " +
                        encInfo.toString());
        }


       // Rewinding to beginning of data
        stream.reset();


        String ENCODING     = encInfo.getEncoding().toUpperCase(Locale.ENGLISH);
        Boolean isBigEndian = encInfo.isBigEndian();
        boolean hasBOM      = encInfo.hasBOM();


        /*
         * Special case UTF-8 files with BOM created by Microsoft
         * tools. It's more efficient to consume the BOM than make
         * the reader perform extra checks. -Ac
         */
        if (hasBOM && ENCODING.equals("UTF-8")) {
           // ignore first three bytes...
            stream.skip(3);
        }


        /*
         * The specifics of `getEncodingName` work is that it always returns
         * UTF-16 with BOM as either UTF-16LE or UTF-16BE, and
         * InputStreamReader doesn't expect BOM coming with UTF-16LE|BE
         * encoded data. So this BOM should also be removed, if present.
         */
        if (count > 1 && (ENCODING.equals("UTF-16LE") ||
                          ENCODING.equals("UTF-16BE"))) {
            int b0 = b4[0] & 0xFF;
            int b1 = b4[1] & 0xFF;
            if ((b0 == 0xFF && b1 == 0xFE) || (b0 == 0xFE && b1 == 0xFF)) {
               // ignore first two bytes...
                stream.skip(2);
            }
        }




        Reader reader = null;


        /*
         * We must use custom class to read UCS-4 data, my JVM doesn't support
         * this encoding scheme by default and I doubt other JVMs are.
         *
         * There was another specific reader for UTF-8 encoding in Xerces
         * (org.apache.xerces.impl.io.UTF8Reader), which they say is
         * optimized one. May be it is really better than JVM's default
         * decoding algorithm but I doubt the necessity of porting just
         * another (not so small) class in order to "efficiently" extract
         * a couple of chars from XML declaration. Still I may be mistaking
         * there. Moreover, Xerces' UTF8Reader has some internal dependencies
         * and it will take much more effort to extract it from there.
         *
         * Also, at this stage it is quite impossible to have "ISO-10646-UCS-2"
         * as a value for ENCODING.
         *
         * You can avoid possible bugs in UCSReader by commenting out this
         * block of code together with following `if`. Then you will get an
         * UnsupportedEncodingException for UCS-4 encoded data.
         */
        if ("ISO-10646-UCS-4".equals(ENCODING)) {


            if (null != isBigEndian) {
                boolean isBE = isBigEndian.booleanValue();
                if (isBE) {
                    reader = new UCSReader(stream, UCSReader.UCS4BE);
                } else {
                    reader = new UCSReader(stream, UCSReader.UCS4LE);
                }
            } else {
               // Fatal error, UCSReader will fail to decode this properly
                String s = "Unsupported byte order for ISO-10646-UCS-4 encoding.";
                throw new UnsupportedCharsetException(s);
            }


        }


        if (null == reader) {
            reader = new InputStreamReader(stream, ENCODING);
        }


       //
       // Phase 2. Reading XML declaration and extracting charset info from it.
        String declEncoding = getXmlEncoding(reader);


        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Charset detection phase 2. Charset in XML declaration "
                       + "is `" + declEncoding + "`.");
        }


        stream.reset();


        /*
         * Now RewindableInputStream is allowed to return more than one byte
         * per read operation. It also will not buffer bytes read using
         * `read(byte[], int, int)` method.
         */
        stream.setChunkedMode(true);


        /*
         * Reusing existing reader if possible, creating new one only if
         * declared charset name differs from guessed one
         */
        if (null != declEncoding && !declEncoding.equals(ENCODING)) {
           /*
            * I believe that for UCS-2 encoding default UTF-16 reader
            * (which is already created at this time) should suffice
            * in most cases. Though, we can always construct a new
            * UCSReader instance, if I am wrong here.
            */
            if (!declEncoding.equals("ISO-10646-UCS-2")) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Declared charset differs from inferred one. "
                               + "Trying to construct InputStreamReader for `"
                               + declEncoding + "`.");
                }
                reader = new InputStreamReader(stream, declEncoding);
                encInfo.setEncoding(declEncoding);
            }
        }


        return reader;


    } // END getCharsetAwareReader(InputStream) : Reader




    /**
     * Use this variant when you aren't interested in encoding data, and just
     * want to get a suitable reader for incoming request.
     *
     * @param istream See <code>getCharsetAwareReader(InputStream,
     *                              EncodingInfo)</code>.
     *
     */
    public static Reader getCharsetAwareReader(InputStream istream)
            throws
                IOException,
                UnsupportedCharsetException {
        return getCharsetAwareReader(istream, new EncodingInfo());
    }




    /**
     * Creates a new reader on top of the given <code>InputStream</code> using
     * existing (external) encoding information. Unlike
     * <code>getCharsetAwareReader</code>, this method never tries to detect
     * charset or encoding scheme of <code>InputStream</code>'s data. This also
     * means that it <em>must</em> be provided with valid
     * <code>EncodingInfo</code> instance, which may be obtained, for example,
     * from previous <code>getCharsetAwareReader(InputStream, EncodingInfo)</code>
     * call.
     *
     * @param istream byte-stream containing textual (presumably XML) data
     * @param encInfo correctly initialized object which holds information of
     *                the above byte-stream's contents charset.
     *
     * @throws IllegalArgumentException      if charset name is not specified
     * @throws UnsupportedEncodingException  in cases when specified charset is
     *             not supported by platform or due to invalid byte order for
     *             <code>ISO-10646-UCS-2|4</code> charsets.
     *
     */
    public static Reader createReader(InputStream istream,
                                      EncodingInfo encInfo)
            throws
                IllegalArgumentException,
                UnsupportedEncodingException {


        String charset = encInfo.getEncoding();
        Boolean isBigEndian = encInfo.isBigEndian();


       // We MUST know encoding (in fact, charset) name, and as EncodingInfo
       // have non-arg constructor, its `getEncoding` can return null.
        if (null == charset) {
            String s = "Name of the charset must not be NULL!";
            throw new IllegalArgumentException(s);
        }


        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Trying to create reader basing on existing charset "
                       + "information: `" + encInfo + "`.");
        }


        Reader reader = null;


       // UCS-2|4 charsets are handled with custom reader
        if ("ISO-10646-UCS-4".equals(charset)) {


            if (null != isBigEndian) {
                boolean isBE = isBigEndian.booleanValue();
                if (isBE) {
                    reader = new UCSReader(istream, UCSReader.UCS4BE);
                } else {
                    reader = new UCSReader(istream, UCSReader.UCS4LE);
                }


            } else {
               // Fatal error, UCSReader will fail to decode this properly
                String s = "Unsupported byte order for ISO-10646-UCS-4 encoding.";
                throw new UnsupportedEncodingException(s);
            }


        } else if ("ISO-10646-UCS-2".equals(charset)) {


            if (null != isBigEndian) {
                boolean isBE = isBigEndian.booleanValue();
                if (isBE) {
                    reader = new UCSReader(istream, UCSReader.UCS4BE);
                } else {
                    reader = new UCSReader(istream, UCSReader.UCS4LE);
                }
            } else {
               // Cannot construct UCSReader without byte order info
                String s = "Byte order must be specified for ISO-10646-UCS-2.";
                throw new UnsupportedEncodingException(s);
            }


        } else {
            reader = new InputStreamReader(istream, charset);
        }


        return reader;


    } // END createReader(InputStream, EncodingInfo) : Reader




    /**
     * Returns the IANA encoding name that is auto-detected from
     * the bytes specified, with the endian-ness of that encoding where
     * appropriate. Note, that encoding obtained this way is only an
     * <em>encoding scheme</em> of the request, i.e. step 1 of detection
     * process. To learn the exact <em>charset</em> of the request data,
     * you should also perform step 2 - read XML declaration and get the
     * value of its <code>encoding</code> pseudoattribute.
     *
     * @param b4    The first four bytes of the input.
     * @param count The number of bytes actually read.
     * @return Instance of EncodingInfo incapsulating all encoding-related data.
     */
    protected static EncodingInfo getEncodingName(byte[] b4, int count) {


        if (count < 2) {
            return new EncodingInfo("UTF-8", null);
        }


        // UTF-16, with BOM
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return new EncodingInfo("UTF-16BE", new Boolean(true), true);
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return new EncodingInfo("UTF-16LE", new Boolean(false), true);
        }


        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 3) {
            return new EncodingInfo("UTF-8", null);
        }


        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return new EncodingInfo("UTF-8", null, true);
        }


        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 4) {
            return new EncodingInfo("UTF-8", null);
        }


        // other encodings
        int b3 = b4[3] & 0xFF;


        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return new EncodingInfo("ISO-10646-UCS-4", new Boolean(true));
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return new EncodingInfo("ISO-10646-UCS-4", new Boolean(false));
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            // REVISIT: What should this be? (Currently this would be
            // an exception :)
            return new EncodingInfo("ISO-10646-UCS-4", null);
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octect order (3412)
            // REVISIT: What should this be?
            return new EncodingInfo("ISO-10646-UCS-4", null);
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...
            // REVISIT: What should this be?
            return new EncodingInfo("UTF-16BE", new Boolean(true));
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...
            return new EncodingInfo("UTF-16LE", new Boolean(false));
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return new EncodingInfo("CP037", null);
        }


        // default encoding
        return new EncodingInfo("UTF-8", null);


    } // END getEncodingName(byte[], int) : EncodingInfo






    /**
     * Gets the encoding of the xml request made to the dispatcher.  This
     * works by reading the temp file where we are storing the request,
     * looking to match the header specified encoding that should be present
     * on all xml files.  This call should only be made after the temp file
     * has been set.  If no encoding is found, or if an IOError is encountered
     * then null shall be returned.
     *
     * @param reader This character stream is supposed to contain XML data
     *               (i.e. it should start with valid XML declaration).
     *
     * @return The encoding specified in the xml header read from the supplied
     *         character stream.
     */
    protected static String getXmlEncoding(Reader reader) {


        try {


            StringWriter sw = new StringWriter(MAX_XMLDECL_SIZE);


            int c;
            int count = 0;
            for (; (6 > count) && (-1 != (c = reader.read())); count++) {
                sw.write(c);
            }


           /*
            * Hmm, checking for the case when there is no XML declaration and
            * document begins with processing instruction whose target name
            * starts with "<?xml" ("<?xmlfoo"). Sounds like a nearly impossible
            * thing, but Xerces guys are checking for that somewhere in the
            * depths of their code :)
            */
            if ((6 > count) || (!"<?xml ".equals(sw.toString()))) {
                if (LOGGER.isLoggable(Level.WARNING)) {
                    LOGGER.warning("Invalid(?) XML declaration: "
                                  + sw.toString() + ".");
                }
                return null;
            }


            /*
             * Continuing reading declaration(?) til the first '>' ('\u003E')
             * encountered. Conversion from `int` to `char` should be safe
             * for our purposes, at least I'm not expecting any extended
             * (0x10000+) characters in xml declaration. I also limited
             * the total number of chars read this way to prevent any
             * malformed (no '>') input potentially forcing us to read
             * megabytes of useless data :)
             */
            for (; (MAX_XMLDECL_SIZE > count)
                   && (-1 != (c = reader.read()))
                   && (RIGHT_ANGLE_BRACKET != (char) c); count++) {
                sw.write(c);
            }


            Matcher m = ENCODING_PATTERN.matcher(sw.toString());
            if (m.find()) {
                String result = m.group(1);
                return result;
            } else {
                return null;
            }


        } catch (IOException e) {
            if (LOGGER.isLoggable(Level.WARNING)) {
                LOGGER.warning("Failed to extract charset info from XML "
                        + "declaration due to IOException: " + e.getMessage());
            }
            return null;
        }


    } // END getXmlEncoding(Reader) : String


} // END class XmlCharsetDetector
// Source code of org.vfny.geoserver.util.requests.XmlCharsetDetector

// Related classes of org.vfny.geoserver.util.requests.XmlCharsetDetector