Package org.apache.shindig.gadgets.encoding

Source Code of org.apache.shindig.gadgets.encoding.EncodingDetector

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.shindig.gadgets.encoding;

import java.nio.charset.Charset;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
* Attempts to determine the encoding of a given string.
*
* Highly skewed towards common encodings (UTF-8 and Latin-1).
*/
public class EncodingDetector {
  private static final Charset UTF_8 = Charset.forName("UTF-8");
  private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");

  /**
   * Returns the detected encoding of the given byte array.
   *
   * @param input The data to detect the encoding for.
   * @param assume88591IfNotUtf8 True to assume that the encoding is ISO-8859-1 (the standard
   *     encoding for HTTP) if the bytes are not valid UTF-8. Only recommended if you can reasonably
   *     expect that other encodings are going to be specified. Full encoding detection is very
   *     expensive!
   * @return The detected encoding.
   */
  public static Charset detectEncoding(byte[] input, boolean assume88591IfNotUtf8) {
    if (looksLikeValidUtf8(input)) {
      return UTF_8;
    }

    if (assume88591IfNotUtf8) {
      return ISO_8859_1;
    }

    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    CharsetMatch match = detector.detect();
    return Charset.forName(match.getName().toUpperCase());
  }

  /**
   * A pretty good test that something is UTF-8. There are many sequences that will pass here that
   * aren't valid UTF-8 due to the requirement that the shortest possible sequence always be used.
   * We're ok with this behavior because the main goal is speed.
   */
  private static boolean looksLikeValidUtf8(byte[] input) {
    int i = 0;
    if (input.length >= 3 &&
       (input[0] & 0xFF) == 0xEF &&
       (input[1] & 0xFF) == 0xBB &
       (input[2] & 0xFF) == 0xBF) {
      // Skip BOM.
      i = 3;
    }

    int endOfSequence;
    for (int j = input.length; i < j; ++i) {
      int bite = input[i];
      if ((bite & 0x80) == 0) {
        continue; // ASCII
      }

      // Determine number of bytes in the sequence.
      if ((bite & 0x0E0) == 0x0C0) {
        endOfSequence = i + 1;
      } else if ((bite & 0x0F0) == 0x0E0) {
        endOfSequence = i + 2;
      } else if ((bite & 0x0F8) == 0xF0) {
        endOfSequence = i + 3;
      } else {
        // Not a valid utf-8 byte sequence. Skip.
        return false;
      }

      while (i < endOfSequence) {
        i++;
        bite = input[i];
        if ((bite & 0xC0) != 0x80) {
          // High bit not set, not a vlaid sequence
          return false;
        }
      }
    }
    return true;
  }
}
TOP

Related Classes of org.apache.shindig.gadgets.encoding.EncodingDetector

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.