Package com.ibm.icu.text

Examples of com.ibm.icu.text.CharsetMatch


  }

  public String autoDetectEncoding(byte[] bytes) {
    CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    CharsetMatch charsetMatch = cd.detect();
    String charSet = charsetMatch.getName();

    int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    setSelectedItem(charSet);
    return charSet;
  }
View Full Code Here


            stream = new BufferedInputStream(stream);
        }
   
        detector.setText(stream);
   
        CharsetMatch match = detector.detect();
        if (match == null) {
            throw new TikaException("Unable to detect character encoding");
        }
       
        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
        String language = match.getLanguage();
        if (language != null) {
            metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
            metadata.set(Metadata.LANGUAGE, match.getLanguage());
        }
       
        return match.getReader();
    }
View Full Code Here

            }

            is = new BufferedInputStream( new FileInputStream( f ) );
            CharsetDetector detector = new CharsetDetector();
            detector.setText( is );
            CharsetMatch match = detector.detect();

            return match.getName().toUpperCase( Locale.ENGLISH );
        }
        catch ( IOException e )
        {
            // nop
        }
View Full Code Here

    }

    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    CharsetMatch match = detector.detect();
    return Charset.forName(match.getName().toUpperCase());
  }
View Full Code Here

  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }
View Full Code Here

        }

        if (Status.OK.equals(status)) {
          CharsetDetector detector = new CharsetDetector();
          detector.setText(read(response.getEntity().getContent()));
          CharsetMatch match = detector.detect();

          log.debug("Detected charset: " + match.getName());

          String content = match.getString();
          CharBuffer buffer = CharBuffer.wrap(content.toCharArray());
          Charset utf8Charset = Charset.forName("UTF-8");
          String utf8Content = new String(utf8Charset.encode(buffer).array(), "UTF-8");

          return new OkPage(url, utf8Content);
View Full Code Here

        JMenuItem menuItem;
       
        menu.removeAll();
       
        for (int i = 0; i < matches.length; i += 1) {
            CharsetMatch match = matches[i];
           
            menuItem = new JMenuItem(encodingName(match) + " " + match.getConfidence());
           
            menu.add(menuItem);
        }
    }
View Full Code Here

    public void TestInputFilter() throws Exception
    {
        String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
        byte[] bytes = s.getBytes("ISO-8859-1");
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
       
        det.enableInputFilter(true);
        if (!det.inputFilterEnabled()){
            errln("input filter should be enabled");
        }
       
        det.setText(bytes);
        m = det.detect();
       
        if (! m.getLanguage().equals("fr")) {
            errln("input filter did not strip markup!");
        }
       
        det.enableInputFilter(false);
        det.setText(bytes);
        m = det.detect();
       
        if (! m.getLanguage().equals("en")) {
            errln("unfiltered input did not detect as English!");
        }
    }
View Full Code Here

                "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
       
        byte[] beBytes = source.getBytes("UnicodeBig");
        byte[] leBytes = source.getBytes("UnicodeLittle");
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
       
        det.setText(beBytes);
        m = det.detect();
       
        if (! m.getName().equals("UTF-16BE")) {
            errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
        }
       
        det.setText(leBytes);
        m = det.detect();
       
        if (! m.getName().equals("UTF-16LE")) {
            errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
        }

        // Jitterbug 4451, for coverage
        int confidence = m.getConfidence();
        if(confidence != 100){
            errln("Did not get the expected confidence level " + confidence);
        }
        int matchType = m.getMatchType();
        if(matchType != 0){
            errln("Did not get the expected matchType level " + matchType);
        }
    }
View Full Code Here

        byte[] bISO     = sISO.getBytes("ISO-8859-1");
        byte[] bWindows = sWindows.getBytes("windows-1252");
       
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
       
        det.setText(bWindows);
        m = det.detect();
       
        if (m.getName() != "windows-1252") {
            errln("Text with C1 bytes not correctly detected as windows-1252.");
            return;
        }
       
        det.setText(bISO);
        m = det.detect();
       
        if (m.getName() != "ISO-8859-1") {
            errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
        }
    }
View Full Code Here

TOP

Related Classes of com.ibm.icu.text.CharsetMatch

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.