Examples of com.ibm.icu.text.CharsetMatch

com.ibm.icu.text.CharsetMatch
This class represents a charset that has been identified by a CharsetDetector as a possible encoding for a set of input data. From an instance of this class, you can ask for a confidence level in the charset identification, or for a Java Reader or String to access the original byte data in Unicode form.
Instances of this class are created only by CharsetDetectors.
Note: this class has a natural ordering that is inconsistent with equals. The natural ordering is based on the match confidence value. @stable ICU 3.4

  }


  public String autoDetectEncoding(byte[] bytes) {
    CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    CharsetMatch charsetMatch = cd.detect();
    String charSet = charsetMatch.getName();


    int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    setSelectedItem(charSet);
    return charSet;
  }

View Full Code Here

            stream = new BufferedInputStream(stream);
        }
    
        detector.setText(stream);
    
        CharsetMatch match = detector.detect();
        if (match == null) {
            throw new TikaException("Unable to detect character encoding");
        }
        
        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
        String language = match.getLanguage();
        if (language != null) {
            metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
            metadata.set(Metadata.LANGUAGE, match.getLanguage());
        }
        
        return match.getReader();
    }

View Full Code Here

            }


            is = new BufferedInputStream( new FileInputStream( f ) );
            CharsetDetector detector = new CharsetDetector();
            detector.setText( is );
            CharsetMatch match = detector.detect();


            return match.getName().toUpperCase( Locale.ENGLISH );
        }
        catch ( IOException e )
        {
            // nop
        }

View Full Code Here

    }


    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    CharsetMatch match = detector.detect();
    return Charset.forName(match.getName().toUpperCase());
  }

View Full Code Here

  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }

View Full Code Here

        }


        if (Status.OK.equals(status)) {
          CharsetDetector detector = new CharsetDetector();
          detector.setText(read(response.getEntity().getContent()));
          CharsetMatch match = detector.detect();


          log.debug("Detected charset: " + match.getName());


          String content = match.getString();
          CharBuffer buffer = CharBuffer.wrap(content.toCharArray());
          Charset utf8Charset = Charset.forName("UTF-8");
          String utf8Content = new String(utf8Charset.encode(buffer).array(), "UTF-8");


          return new OkPage(url, utf8Content);

View Full Code Here

        JMenuItem menuItem;
        
        menu.removeAll();
        
        for (int i = 0; i < matches.length; i += 1) {
            CharsetMatch match = matches[i];
            
            menuItem = new JMenuItem(encodingName(match) + " " + match.getConfidence());
            
            menu.add(menuItem);
        }
    }

View Full Code Here

    public void TestInputFilter() throws Exception
    {
        String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
        byte[] bytes = s.getBytes("ISO-8859-1");
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
        
        det.enableInputFilter(true);
        if (!det.inputFilterEnabled()){
            errln("input filter should be enabled");
        }
        
        det.setText(bytes);
        m = det.detect();
        
        if (! m.getLanguage().equals("fr")) {
            errln("input filter did not strip markup!");
        }
        
        det.enableInputFilter(false);
        det.setText(bytes);
        m = det.detect();
        
        if (! m.getLanguage().equals("en")) {
            errln("unfiltered input did not detect as English!");
        }
    }

View Full Code Here

                "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
        
        byte[] beBytes = source.getBytes("UnicodeBig");
        byte[] leBytes = source.getBytes("UnicodeLittle");
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
        
        det.setText(beBytes);
        m = det.detect();
        
        if (! m.getName().equals("UTF-16BE")) {
            errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
        }
        
        det.setText(leBytes);
        m = det.detect();
        
        if (! m.getName().equals("UTF-16LE")) {
            errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
        }


        // Jitterbug 4451, for coverage
        int confidence = m.getConfidence(); 
        if(confidence != 100){
            errln("Did not get the expected confidence level " + confidence);
        }
        int matchType = m.getMatchType();
        if(matchType != 0){
            errln("Did not get the expected matchType level " + matchType);
        }
    }

View Full Code Here


        byte[] bISO     = sISO.getBytes("ISO-8859-1");
        byte[] bWindows = sWindows.getBytes("windows-1252");
        
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
        
        det.setText(bWindows);
        m = det.detect();
        
        if (m.getName() != "windows-1252") {
            errln("Text with C1 bytes not correctly detected as windows-1252.");
            return;
        }
        
        det.setText(bISO);
        m = det.detect();
        
        if (m.getName() != "ISO-8859-1") {
            errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
        }
    }

View Full Code Here

0 1 2 3

TOP

Related Classes of com.ibm.icu.text.CharsetMatch

com.ibm.icu.dev.demo.charsetdet.DetectingViewer

com.ibm.icu.dev.test.charsetdet.TestCharsetDetector

net.sf.jmatchparser.util.charset.icu4jchardet.ICU4JChardetCharset$Decoder

net.vidageek.crawler.component.WebDownloader

nu.validator.htmlparser.extra.IcuDetectorSniffer

org.apache.marmotta.platform.core.services.importer.ImportWatchServiceImpl

org.apache.maven.doxia.DefaultConverter

org.apache.shindig.gadgets.encoding.EncodingDetector

org.apache.shindig.gadgets.encoding.EncodingDetector$FallbackEncodingDetector

org.apache.stanbol.enhancer.engines.htmlextractor.impl.CharsetRecognizer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.