Source Code of org.apache.sis.internal.storage.xml.MimeTypeDetector

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.sis.internal.storage.xml;


import java.util.Map;
import java.util.HashMap;
import java.util.Arrays;
import java.io.IOException;
import org.apache.sis.storage.DataStoreException;
import org.apache.sis.storage.ProbeResult;
import org.apache.sis.xml.Namespaces;




/**
 * Detects the MIME type of an XML document from the namespace of the root element.
 * This class does not support encoding: it will search only for US-ASCII characters.
 * It does not prevent usage with encodings like ISO-LATIN-1 or UTF-8, provided that
 * the characters in the [32 … 122] range (from space to 'z') are the same and can not
 * be used as part of a multi-byte character.
 *
 * <p>This class tries to implement a lightweight detection mechanism. We can not for instance
 * unmarshall the whole document with JAXB and look at the class of unmarshalled object, since
 * it would be way too heavy.</p>
 *
 * @author  Martin Desruisseaux (Geomatys)
 * @since   0.4
 * @version 0.4
 * @module
 */
abstract class MimeTypeDetector {
    /**
     * The mapping from XML namespace to MIME type.
     * This map shall be read-only, since we do not synchronize it.
     */
    private static final Map<String,String> TYPES = new HashMap<String,String>();
    static {
        TYPES.put(Namespaces.GML, "application/gml+xml");
        TYPES.put(Namespaces.GMD, "application/vnd.iso.19139+xml");
        TYPES.put(Namespaces.CSW, "application/vnd.ogc.csw_xml");
        // More types to be added in future versions.
    }


    /**
     * The {@code "xmlns"} string as a sequence of bytes.
     */
    private static byte[] XMLNS = {'x','m','l','n','s'};


    /**
     * The maximal US-ASCII value, inclusive.
     */
    private static final int MAX_ASCII = 126;


    /**
     * A buffer for reading a word from the XML document, assumed using US-ASCII characters.
     */
    private byte[] buffer = new byte[32];


    /**
     * Number of valid characters in {@link #buffer} string.
     */
    private int length;


    /**
     * Sets to {@code true} when {@link #read()} implementations reached the {@link java.nio.ByteBuffer} limit,
     * but the buffer has enough capacity for more bytes. In such case the {@link #probeContent()} method will
     * return {@link ProbeResult#INSUFFICIENT_BYTES}, which means that the method requests more bytes for
     * detecting the MIME type.
     *
     * @see ProbeResult#INSUFFICIENT_BYTES
     */
    boolean insufficientBytes;


    /**
     * Creates a new instance.
     */
    MimeTypeDetector() {
    }


    /**
     * Adds the given byte in the {@link #buffer}, increasing its capacity if needed.
     */
    private void add(final int c) {
        if (length == buffer.length) {
            buffer = Arrays.copyOf(buffer, length*2);
        }
        buffer[length++] = (byte) c;
    }


    /**
     * Reads a single byte or character, or -1 if we reached the end of the stream portion that we are allowed
     * to read. We are typically not allowed to read the full stream because only a limited amount of bytes is
     * cached.
     *
     * @return The character, or -1 on EOF.
     * @throws IOException if an error occurred while reading the byte or character.
     */
    abstract int read() throws IOException;


    /**
     * Skips all bytes or characters up to {@code search}, then returns the character after it.
     * Characters inside quotes will be ignored.
     *
     * @param  search The byte or character to skip.
     * @return The byte or character after {@code search}, or -1 on EOF.
     * @throws IOException if an error occurred while reading the bytes or characters.
     */
    private int readAfter(final int search) throws IOException {
        int c;
        boolean isQuote = false;
        while ((c = read()) >= 0) {
            if (c == '"') {
                isQuote = !isQuote;
            } else if (c == search && !isQuote) {
                return read();
            }
        }
        return -1;
    }


    /**
     * If the given character is a space, skip it and all following spaces.
     * Returns the first non-space character.
     *
     * <p>For the purpose of this method, a "space" is considered to be the {@code ' '} character
     * and all control characters (character below 32, which include tabulations and line feeds).
     * This is the same criterion than {@link String#trim()}, but is not Unicode spaces.</p>
     *
     * @return The first non-space character, or -1 on EOF.
     * @throws IOException if an error occurred while reading the bytes or characters.
     */
    private int afterSpaces(int c) throws IOException {
        while (c <= ' ' && c >= 0) {
            c = read();
        }
        return c;
    }


    /**
     * Skips the spaces if any, then the given characters, then the spaces, then the given separator.
     * After this method class, the stream position is on the first character after the separator if
     * a match has been found, or after the first unknown character otherwise.
     *
     * @param  word The word to search, as US-ASCII characters.
     * @param  n Number of valid characters in {@code word}.
     * @param  separator The {@code ':'} or {@code '='} character.
     * @return 1 if a match is found, 0 if no match, or -1 on EOF.
     * @throws IOException if an error occurred while reading the bytes or characters.
     */
    private int matches(final byte[] word, final int n, final char separator) throws IOException {
        int c = afterSpaces(read());
        for (int i=0; i<n; i++) {
            if (c != word[i]) {
                return (c >= 0) ? 0 : -1;
            }
            c = read();
        }
        c = afterSpaces(c);
        return (c == separator) ? 1 : (c >= 0) ? 0 : -1;
    }


    /**
     * Returns the MIME type, or {@code null} if unknown.
     *
     * @throws IOException if an error occurred while reading the bytes or characters.
     */
    final String getMimeType() throws IOException {
        if (readAfter('?') != '>') {
            return null;
        }
        /*
         * At this point, we skipped the "<?xml ...?>" header.
         * Find the first < character, skipping comment (if any).
         */
        int c;
        while ((c = readAfter('<')) == '!') {
            do {
                c = readAfter('-');
                while (c == '-') {
                    c = read();
                }
                if (c < 0) {
                    return null;
                }
            } while (c != '>');
        }
        /*
         * At this point, we are after the opening bracket of root element.
         * Skip spaces and read the prefix, which is assumed mandatory.
         */
        c = afterSpaces(c);
        while (c > ' ' && c != ':') {
            if (c == '>' || c > MAX_ASCII) {
                return null;
            }
            add(c);
            c = read();
        }
        /*
         * At this point, we got the prefix of the root element. Skip the ':'
         * character and find the "xmlns" attribute following spaces.
         */
        c = afterSpaces(c);
        if (c != ':') {
            return null;
        }
        while (true) {
            int m = matches(XMLNS, XMLNS.length, ':');
            if (m != 0) {
                if (m < 0) {
                    return null;
                }
                m = matches(buffer, length, '=');
                if (m != 0) {
                    if (m < 0) {
                        return null;
                    }
                    break;
                }
            }
            // Skip everything up to the next space, and check again.
            while ((c = read()) >= ' ');
            if (c < 0) return null;
            continue;
        }
        /*
         * At this point, we found the "xmlns" attribute for the prefix of the root element.
         * Get the attribute value (i.e. the namespace).
         */
        length = 0;
        c = afterSpaces(read());
        if (c != '"') {
            return null;
        }
        c = afterSpaces(read());
        do {
            if (c < 0 || c > MAX_ASCII) {
                return null;
            }
            add(c);
            c = read();
        } while (c != '"');
        /*
         * Done reading the "xmlns" attribute value.
         */
        return TYPES.get(new String(buffer, 0, length, "US-ASCII"));
    }


    /**
     * Wraps the call to {@link #getMimeType()} for catching {@link IOException} and for
     * instantiating the {@link ProbeResult}.
     */
    final ProbeResult probeContent() throws DataStoreException {
        String mimeType;
        try {
            mimeType = getMimeType();
        } catch (IOException e) {
            throw new DataStoreException(e);
        }
        if (mimeType == null) {
            if (insufficientBytes) {
                return ProbeResult.INSUFFICIENT_BYTES;
            }
            mimeType = XMLStoreProvider.MIME_TYPE;
        }
        return new ProbeResult(true, mimeType, null);
    }
}
Source Code of org.apache.sis.internal.storage.xml.MimeTypeDetector

Related Classes of org.apache.sis.internal.storage.xml.MimeTypeDetector