Examples of CharsetDetector


Examples of com.ibm.icu.text.CharsetDetector

        }
    }
   
    public void TestConstruction() {
        int i;
        CharsetDetector  det = new CharsetDetector();
        if(det==null){
            errln("Could not construct a charset detector");
        }
        String [] charsetNames = CharsetDetector.getAllDetectableCharsets();
        CheckAssert(charsetNames.length != 0);
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector

    public void TestInputFilter() throws Exception
    {
        String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
        byte[] bytes = s.getBytes("ISO-8859-1");
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
       
        det.enableInputFilter(true);
        if (!det.inputFilterEnabled()){
            errln("input filter should be enabled");
        }
       
        det.setText(bytes);
        m = det.detect();
       
        if (! m.getLanguage().equals("fr")) {
            errln("input filter did not strip markup!");
        }
       
        det.enableInputFilter(false);
        det.setText(bytes);
        m = det.detect();
       
        if (! m.getLanguage().equals("en")) {
            errln("unfiltered input did not detect as English!");
        }
    }
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector

        String  s = "This is a string with some non-ascii characters that will " +
                    "be converted to UTF-8, then shoved through the detection process.  " +
                    "\u0391\u0392\u0393\u0394\u0395" +
                    "Sure would be nice if our source could contain Unicode directly!";
        byte [] bytes = s.getBytes("UTF-8");
        CharsetDetector det = new CharsetDetector();
        String retrievedS;
        Reader reader;
       
        retrievedS = det.getString(bytes, "UTF-8");
        CheckAssert(s.equals(retrievedS));
       
        reader = det.getReader(new ByteArrayInputStream(bytes), "UTF-8");
        CheckAssert(s.equals(stringFromReader(reader)));
        det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
    }
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector

                "u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " +
                "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
       
        byte[] beBytes = source.getBytes("UnicodeBig");
        byte[] leBytes = source.getBytes("UnicodeLittle");
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
       
        det.setText(beBytes);
        m = det.detect();
       
        if (! m.getName().equals("UTF-16BE")) {
            errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
        }
       
        det.setText(leBytes);
        m = det.detect();
       
        if (! m.getName().equals("UTF-16LE")) {
            errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
        }
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector

            "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";

        byte[] bISO     = sISO.getBytes("ISO-8859-1");
        byte[] bWindows = sWindows.getBytes("windows-1252");
       
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
       
        det.setText(bWindows);
        m = det.detect();
       
        if (m.getName() != "windows-1252") {
            errln("Text with C1 bytes not correctly detected as windows-1252.");
            return;
        }
       
        det.setText(bISO);
        m = det.detect();
       
        if (m.getName() != "ISO-8859-1") {
            errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
        }
    }
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector

                {(byte)'A', (byte)'B'},
                {(byte)'A', (byte)'B', (byte)'C'},
                {(byte)'A', (byte)'B', (byte)'C', (byte)'D'}
            };
       
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
        for (int i=0; i<shortBytes.length; i++) {
            det.setText(shortBytes[i]);
            m = det.detect();
            logln("i=" + i + " -> " + m.getName());
        }
    }
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector

            null,
            null,
            "ISO-8859-1"
        };
       
        CharsetDetector det = new CharsetDetector();
        CharsetMatch match;

        det.setDeclaredEncoding("ISO-2022-JP");

        for (int idx = 0; idx < testStrings.length; idx += 1) {
            det.setText(testStrings[idx]);
            match = det.detect();

            if (match == null) {
                if (testResults[idx] != null) {
                    errln("Unexpectedly got no results at index " + idx);
                }
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector

        if (split.length > 1) {
            lang = split[1];
        }

        try {
            CharsetDetector det = new CharsetDetector();
            byte[] bytes;
           
            //if (enc.startsWith("UTF-32")) {
            //    UTF32 utf32 = UTF32.getInstance(enc);
               
            //    bytes = utf32.toBytes(testString);
            //} else {
                String from = enc;

                while (true) {
                    try {
                        bytes = testString.getBytes(from);
                    } catch (UnsupportedOperationException uoe) {
                         // In some runtimes, the ISO-2022-CN converter
                         // only converts *to* Unicode - we have to use
                         // x-ISO-2022-CN-GB to convert *from* Unicode.
                        if (from.equals("ISO-2022-CN")) {
                            from = "x-ISO-2022-CN-GB";
                            continue;
                        }
                       
                        // Ignore any other converters that can't
                        // convert from Unicode.
                        return;
                    } catch (UnsupportedEncodingException uee) {
                        // Ignore any encodings that this runtime
                        // doesn't support.
                        return;
                    }
                   
                    break;
                }
            //}
       
            det.setText(bytes);
            checkMatch(det, testString, enc, lang, id);
           
            det.setText(new ByteArrayInputStream(bytes));
            checkMatch(det, testString, enc, lang, id);
         } catch (Exception e) {
            errln(id + ": " + e.toString() + "enc=" + enc);
            e.printStackTrace();
        }
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector

        CheckAssert(charsetMatch.equals("IBM420_ltr"));

    }

    private CharsetMatch _testIBM420_ar_rtl(String s, CharsetEncoder encoder) throws Exception {
        CharsetDetector det = new CharsetDetector();
        det.setText(encoder.encode(CharBuffer.wrap(s)).array());
        CharsetMatch m = det.detect();
        return m;
    }
View Full Code Here

Examples of com.ibm.icu.text.CharsetDetector

         */   
       
        StringBuffer ltrStrBuf = new StringBuffer(s);
        ltrStrBuf = ltrStrBuf.reverse();
       
        CharsetDetector det = new CharsetDetector();
        det.setText(encoder.encode(CharBuffer.wrap(ltrStrBuf.toString())).array());
        CharsetMatch m = det.detect();
        return m;
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.