Examples of com.ibm.icu.text.CharsetMatch

com.ibm.icu.text.CharsetMatch
This class represents a charset that has been identified by a CharsetDetector as a possible encoding for a set of input data. From an instance of this class, you can ask for a confidence level in the charset identification, or for a Java Reader or String to access the original byte data in Unicode form.
Instances of this class are created only by CharsetDetectors.
Note: this class has a natural ordering that is inconsistent with equals. The natural ordering is based on the match confidence value. @stable ICU 3.4

        ltrStrBuf = ltrStrBuf.reverse();
        byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
        
        CharsetDetector det = new CharsetDetector();
        det.setText(bytes);
        CharsetMatch m = det.detect();
        return m;
    }

View Full Code Here

        CharsetDetector detector = new CharsetDetector();
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
        detector.setText(in);
        CharsetMatch found = detector.detect();
        result = found.getName();
        LOG.debug("Encoding: " + result);
        return result;
    }

View Full Code Here

            stream = new BufferedInputStream(stream);
        }
    
        detector.setText(stream);
    
        CharsetMatch match = detector.detect();
        if (match == null) {
            throw new TikaException("Unable to detect character encoding");
        }
        
        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
        String language = match.getLanguage();
        if (language != null) {
            metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
            metadata.set(Metadata.LANGUAGE, match.getLanguage());
        }
        
        return match.getReader();
    }

View Full Code Here

   *             occurs when the detection fails.
   */
  public static String detectEncoding(byte[] data, String defaultEncoding) throws IOException {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch cm = detector.detect();
    String estimatedEncoding = cm.getName();
    boolean isReliable = Charset.isSupported(estimatedEncoding) && cm.getConfidence() >= MINIMAL_CONFIDENCE_LEVEL;
    return isReliable ? estimatedEncoding : defaultEncoding;
  }

View Full Code Here

      //encoding detection
    try {
      BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
      CharsetDetector cd = new CharsetDetector();
      cd.setText(bis);
      CharsetMatch cm = cd.detect();
      if (cm != null) {
        format += "; charset=" + cm.getName();
      }
    } catch (IOException e) {
      log.error("Error detecting charset for '{}': {}", fileName, e.getMessage());
    }

View Full Code Here

  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }

View Full Code Here

        // encoding detection
        // FIXME: is this required?
        try (BufferedInputStream bis = new BufferedInputStream(openStream(file))) {
            CharsetDetector cd = new CharsetDetector();
            cd.setText(bis);
            CharsetMatch cm = cd.detect();
            if (cm != null) {
                log.trace("Detected charset {} in {}", cm.getName(), file);
                format += "; charset=" + cm.getName();
            }
            bis.close();
        } catch (IOException e) {
            log.error("Error detecting charset for '{}': {}", fileName, e.getMessage());
        }

View Full Code Here

    }


    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    CharsetMatch match = detector.detect();
    return match.getName().toUpperCase();
  }

View Full Code Here

  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }

View Full Code Here

        CharsetDetector detector = new CharsetDetector();
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
        detector.setText(in);
        CharsetMatch found = detector.detect();
        result = found.getName();
        LOG.debug("Encoding: " + result);
        return result;
    }

View Full Code Here

0 1 2 3

TOP

Related Classes of com.ibm.icu.text.CharsetMatch

com.ibm.icu.dev.demo.charsetdet.DetectingViewer

com.ibm.icu.dev.test.charsetdet.TestCharsetDetector

net.sf.jmatchparser.util.charset.icu4jchardet.ICU4JChardetCharset$Decoder

net.vidageek.crawler.component.WebDownloader

nu.validator.htmlparser.extra.IcuDetectorSniffer

org.apache.marmotta.platform.core.services.importer.ImportWatchServiceImpl

org.apache.maven.doxia.DefaultConverter

org.apache.shindig.gadgets.encoding.EncodingDetector

org.apache.shindig.gadgets.encoding.EncodingDetector$FallbackEncodingDetector

org.apache.stanbol.enhancer.engines.htmlextractor.impl.CharsetRecognizer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.