Package org.apache.stanbol.enhancer.engines.htmlextractor.impl

Source Code of org.apache.stanbol.enhancer.engines.htmlextractor.impl.CharsetRecognizer

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.htmlextractor.impl;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
* EncodingDetector.java
*
* @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
*
*/
public class CharsetRecognizer {

    /**
     * This contains the logger.
     */
    private static final Logger LOG =
        LoggerFactory.getLogger(CharsetRecognizer.class);


    private static String checkPattern(String str, String pattern, int group) {

        Pattern pat = Pattern.compile(pattern);
        Matcher m = pat.matcher(str);
        if (m.find()) {
            return m.group(group);
        }
        return null;
    }


    private static String checkFormat(String format, InputStream in)
            throws IOException {

        String result = null;
        String defaultValue = null;
        byte[] bytes;
        String decl;
        in.mark(4096);
        int read;
        if (format.equalsIgnoreCase("xml")) {
            defaultValue = "UTF-8";
            bytes = new byte[80];
            read = in.read(bytes);
            in.reset();
            decl = new String(bytes, 0, read, "US-ASCII");
            result = checkPattern(decl, "encoding=\"(\\w[-\\w]+)\"", 1);
        }
        else if (format.equalsIgnoreCase("html")) {
            bytes = new byte[2048];
            read = in.read(bytes);
            in.reset();
            decl = new String(bytes, 0, read, "US-ASCII");
            result =
                checkPattern(decl,
                "<meta .*?content=\".*charset=(\\w[-\\w]+).*?/>", 1);
        }
        if (result != null) {
            result = result.toUpperCase();
            LOG.debug(format.toUpperCase() + " encoding: " + result);
        }
        else {
            return defaultValue;
        }
        return result;
    }


    public static String detect(InputStream in)
            throws IOException {

        return detect(in, null, null);
    }


    public static String detect(InputStream in, String format, String encoding)
            throws IOException {

        // the input stream must support marks
        if (!in.markSupported()) {
            throw new IOException("Mark not supported by input stream");
        }
        String result = null;
        if (format != null) {
            result = checkFormat(format, in);
            if (result != null) {
                return result;
            }
        }
        // in case of HTML or XML check whether there is a charset
        // specification; might be too fragile
        CharsetDetector detector = new CharsetDetector();
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
        detector.setText(in);
        CharsetMatch found = detector.detect();
        result = found.getName();
        LOG.debug("Encoding: " + result);
        return result;
    }

    public static void main(String[] args) {

        String format = null;
        String encoding = null;
        int argv = 0;
        while (argv < args.length && args[argv].startsWith("-")) {
            String option = args[argv].substring(1);
            if (option.startsWith("f")) {
                format = args[++argv];
            }
            else if (option.startsWith("e")) {
                encoding = args[++argv];
            }
            else {
                System.err.println("illegal option: " + option);
                System.exit(1);
            }
            ++argv;
        }
        for (int i = argv; i < args.length; ++i) {
            try {
                BufferedInputStream fstream =
                    new BufferedInputStream(new FileInputStream(args[i]));
                String found =
                    CharsetRecognizer.detect(fstream, format, encoding);
                System.out.println("Encoding: " + found + ": " + args[i]);
                /*
                 * check whether the stream is reset correctly byte[] bytes =
                 * new byte[50]; int read = fstream.read(bytes);
                 * System.out.println(new String(bytes));
                 */
                fstream.close();
            } catch (IOException e) {
                LOG.error(e.getMessage());
            }
        }
    }

}
TOP

Related Classes of org.apache.stanbol.enhancer.engines.htmlextractor.impl.CharsetRecognizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.