Source Code of org.mmisw.orrclient.core.util.Utf8Util$ICharsetDetector

package org.mmisw.orrclient.core.util;


import java.io.File;
import java.io.FileInputStream;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;


import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.mmisw.orrclient.core.util.charset.CharsetDetectorIcu;
import org.mmisw.orrclient.core.util.charset.CharsetDetectorJcd;




/**
 * Utility for detection/verification of UTF-8 charset.
 * 
 * @author Carlos Rueda
 */
public class Utf8Util {
  
  private static final Log log = LogFactory.getLog(Utf8Util.class);
  
  public interface ICharsetDetector {
    
    /**
     * Detects the charsets of the given buffer.
     * 
     * @param bytes the buffer to review.
     * @return the probable charsets
     */
    public Collection<String> detectCharset(byte[] bytes) ;


  }
  
  // 2010-08-16: Setting ICU as the charset detection implementation
  private static ICharsetDetector detector = new CharsetDetectorIcu();
  
  /**
   * Verifies that the given contents are in UTF-8 (or ASCII), throwing an exception if not.
   * 
   * @param bytes the buffer
   * @throws Exception If it's not certain that the contents are in UTF-8 or ASCII; 
   *         the error message will contain an explanation of the problem, including, if possible,
   *         a list of the probable charsets (which may include UTF-8).
   */
  public static void verifyUtf8(byte[] bytes) throws Exception {
    Collection<String> charsets = isUtf8(bytes);
    if ( charsets != null ) {
      throw new Exception("Probable charsets: " +charsets);
    }
  }
  
  /**
   * Is the given buffer in UTF-8 or ASCII?
   * This test is mainly based on java.nio.charset.CharsetDecoder to convert
   * a byte array to string given a certain charset (UTF-8 used here). 
   * If this fails, an exception is immediately thrown. 
   * If not, this method does some further check which may be unnecesary.
   * TODO Under testing.
   * 
   * @param bytes            
   *           the contents to check
   * @return  
   *           null to indicate success, ie., the given buffer is in UTF-8 or ASCII.
   *           
   * @throws Exception  
   *           if conversion to a string assuming UTF-8 generates an exception.
   */
  public static Collection<String> isUtf8(byte[] bytes) throws Exception {
    
    // check it can be decoded assuming UTF-8:
    String str = _utf8toString(bytes);
    // TODO probably, we should just do the above check, and only do the remaining
    // stuff in case we get an exception.
    
    if ( log.isDebugEnabled() ) {
      int len = Math.min(50, str.length());
      log.debug("isUtf8: basic test OK: " +str.subSequence(0, len));
    }
    
    Collection<String> charsets = detector.detectCharset(bytes);
    
    if ( charsets == null || charsets.size() == 0 ) {
      // just return null, so OK.  The following is to drastic a result given that the
      // conversion above was succesful
      return null; // OK
      // NO: throw new Exception("Cannot determine the charset of the given contents");
    }
    
    if ( charsets.contains("UTF-8")  || charsets.contains("ASCII") ) {
      return null; // OK
    }
    
    // some previous version had this check instead of the containment ones above:
//    String charset = charsets.iterator().next();
//    if ( "UTF-8".equalsIgnoreCase(charset) || "ASCII".equalsIgnoreCase(charset) ) {
//      return null; // OK
//    }


    // we give up - return the charsets.
    if ( log.isDebugEnabled() ) {
      log.debug("isUtf8: WARN: basic conversion ok but detected charsets did not include " +
          "UTF-8  or ASCII) !!");
    }
    return charsets;
  }


  
  
  public static void verifyUtf8(File file) throws Exception {
    byte[] bytes = IOUtils.toByteArray(new FileInputStream(file));
    verifyUtf8(bytes);
  }
  
  /**
   * Converts a byte array to a string assuming UTF-8 encoding.
   * 
   * <p>
   * Uses a java.nio.charset.CharsetDecoder to decode the contents assuming UTF-8.
   * This is mainly intended to serve as a first test to verify that a
   * buffer can be read assuming UTF-8.
   * 
   * @param bytes
   * @return
   * @throws CharacterCodingException
   * @throws UnsupportedEncodingException
   */
  private static String _utf8toString(byte[] bytes) throws CharacterCodingException, UnsupportedEncodingException {
    return _byteArrayToString(bytes, "UTF-8");
  }


  /**
   * Converts a byte array to a string using the given encoding.
   * <p>
   * Uses a java.nio.charset.CharsetDecoder.
   */
  private static String _byteArrayToString(byte[] bytes, String charsetName) throws CharacterCodingException, UnsupportedEncodingException {
//    http://www.exampledepot.com/egs/java.nio.charset/ConvertChar.html
    
    // Note that <code>new String(bytes, charsetName)</code> is not useful because: 
    // "The behavior of this constructor when the given bytes are not valid in the 
    // given charset is unspecified."
    
    Charset charset = Charset.forName(charsetName);


    CharsetDecoder decoder = charset.newDecoder();
    decoder
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT)
    ;
    
    ByteBuffer bbuf = ByteBuffer.wrap(bytes);
    CharBuffer cbuf = decoder.decode(bbuf);
    String str = cbuf.toString();
    return str;
  }
  
  
  /**
   * Detects the charset of a given file.
   * @param args args[0] filename
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
    String filename;
    if ( args.length > 0 ) {
      filename = args[0];
      System.out.println("File: " +filename);
    }
    else {
//      filename = "resource/utf8/SEACOOS_Revisions(2).csv";
//      filename = "resource/utf8/utf8-theme-windows-1250.owl";
//      filename = "resource/utf8/theme.owl";
      filename = "resource/utf8/with-non-utf8.csv";
      System.out.println("No arg given. Using a hard-coded filename: " +filename);
    }
    
    
    byte[] bytes = IOUtils.toByteArray(new FileInputStream(filename));
    
    Set<String> triedCharsets = new HashSet<String>();
    
    ICharsetDetector[] detectors = new ICharsetDetector[] { new CharsetDetectorIcu(), new CharsetDetectorJcd(), };
    for ( ICharsetDetector detector : detectors ) {
      Collection<String> charsets = detector.detectCharset(bytes);
      System.out.println(detector.getClass().getSimpleName()+ ": Detected charsets: " +charsets);
      
      if ( true ) {
        for ( String charset : charsets ) {
          if ( ! triedCharsets.contains(charset) ) {
            triedCharsets.add(charset);
            System.out.println("  checking that it can be decoded as " +charset);
            try {
              String result = _byteArrayToString(bytes, charset);
              if ( true )
                System.out.println("    Result: [ " +result+ "]");
            }
            catch(Throwable thr) {
              thr.printStackTrace();
            }
          }
        }
      }
    }
    if ( ! triedCharsets.contains("UTF-8") ) {
      String charset = "UTF-8";
      System.out.println("  checking that it can be decoded as " +charset);
      try {
        _byteArrayToString(bytes, charset);
      }
      catch(Throwable thr) {
        thr.printStackTrace();
      }
    }


  }
}
Source Code of org.mmisw.orrclient.core.util.Utf8Util$ICharsetDetector

Related Classes of org.mmisw.orrclient.core.util.Utf8Util$ICharsetDetector