Examples of enableInputFilter()


Examples of com.ibm.icu.text.CharsetDetector.enableInputFilter()

        }

        // the author didn't tell us the encoding, try the mozilla-heuristic
        if (charset == null) {
          CharsetDetector det = new CharsetDetector();
          det.enableInputFilter(true);
          InputStream detStream = new BufferedInputStream(sourceStream);
          det.setText(detStream);
          charset = det.detect().getName();
          sourceStream = detStream;
        }
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector.enableInputFilter()

        String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
        byte[] bytes = s.getBytes("ISO-8859-1");
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
       
        det.enableInputFilter(true);
        if (!det.inputFilterEnabled()){
            errln("input filter should be enabled");
        }
       
        det.setText(bytes);
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector.enableInputFilter()

       
        if (! m.getLanguage().equals("fr")) {
            errln("input filter did not strip markup!");
        }
       
        det.enableInputFilter(false);
        det.setText(bytes);
        m = det.detect();
       
        if (! m.getLanguage().equals("en")) {
            errln("unfiltered input did not detect as English!");
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector.enableInputFilter()

        }

        // the author didn't tell us the encoding, try the mozilla-heuristic
        if (charset == null) {
            final CharsetDetector det = new CharsetDetector();
            det.enableInputFilter(true);
            final InputStream detStream = new BufferedInputStream(sourceStream);
            det.setText(detStream);
            charset = det.detect().getName();
            sourceStream = detStream;
        }
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector.enableInputFilter()

      // charset detection with icu
      try {
        bis = new BufferedInputStream(new FileInputStream(tempFile));
        CharsetDetector detector;
        detector = new CharsetDetector();
        detector.enableInputFilter(true);
        detector.setText(bis);
        if (declaredEncoding!=null && !"".equals(declaredEncoding))
          detector.setDeclaredEncoding(declaredEncoding);
        CharsetMatch[] matches = null;
        matches = detector.detectAll();
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector.enableInputFilter()

      // charset detection with icu
      bis = new BufferedInputStream(new FileInputStream(tempFile));
      CharsetDetector detector;
      detector = new CharsetDetector();
      detector.enableInputFilter(true);
      detector.setText(bis);
      if (declaredEncoding!=null && !"".equals(declaredEncoding))
        detector.setDeclaredEncoding(declaredEncoding);
      CharsetMatch[] matches = detector.detectAll();
      bis.close();
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector.enableInputFilter()

    @Override
    protected CoderResult implFlush(CharBuffer out) {
      if (usedDecoder == null) {
        CharsetDetector detector = new CharsetDetector();
        detector.enableInputFilter(filtered);
        byte[] data = buffer.toByteArray();
        detector.setText(data);
        CharsetMatch cm = detector.detect();
        try {
          usedDecoder = Charset.forName(cm == null ? "ISO-8859-1" : cm.getName()).newDecoder();
View Full Code Here

Examples of org.apache.tika.parser.txt.CharsetDetector.enableInputFilter()

public class TikaEncodingDetector implements EncodingDetector {

    public String guessEncoding(InputStream is) throws IOException {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText( is instanceof BufferedInputStream ? is : new BufferedInputStream(is) );
        charsetDetector.enableInputFilter(true);
        CharsetMatch cm = charsetDetector.detect();
        return cm.getName();
    }

}
View Full Code Here

Examples of org.apache.tika.parser.txt.CharsetDetector.enableInputFilter()

            detector.setDeclaredEncoding(incomingCharset);
        }

        // TIKA-341 without enabling input filtering (stripping of tags) the
        // short HTML tests don't work well.
        detector.enableInputFilter(true);
        detector.setText(stream);
        for (CharsetMatch match : detector.detectAll()) {
            if (Charset.isSupported(match.getName())) {
                metadata.set(Metadata.CONTENT_ENCODING, match.getName());
View Full Code Here

Examples of org.apache.tika.parser.txt.CharsetDetector.enableInputFilter()

            detector.setDeclaredEncoding(incomingCharset);
        }

        // TIKA-341 without enabling input filtering (stripping of tags) the
        // short HTML tests don't work well.
        detector.enableInputFilter(true);
        detector.setText(stream);
        for (CharsetMatch match : detector.detectAll()) {
            if (Charset.isSupported(match.getName())) {
                metadata.set(Metadata.CONTENT_ENCODING, match.getName());
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.