// Package org.apache.nutch.analysis.lang
//
// Source code of org.apache.nutch.analysis.lang.HTMLLanguageParser$LanguageParser

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.analysis.lang;

// JDK imports
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
import org.apache.tika.language.LanguageIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

public class HTMLLanguageParser implements HtmlParseFilter {

    public static final Logger LOG = LoggerFactory
            .getLogger(HTMLLanguageParser.class);

    private int detect = -1, identify = -1;

    private int contentMaxlength = -1;

    private boolean onlyCertain = false;

    /* A static Map of ISO-639 language codes */
    private static Map<String, String> LANGUAGES_MAP = new HashMap<String, String>();
    static {
        try {
            Properties p = new Properties();
            p.load(HTMLLanguageParser.class
                    .getResourceAsStream("langmappings.properties"));
            Enumeration<?> keys = p.keys();
            while (keys.hasMoreElements()) {
                String key = (String) keys.nextElement();
                String[] values = p.getProperty(key).split(",", -1);
                LANGUAGES_MAP.put(key, key);
                for (int i = 0; i < values.length; i++) {
                    LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
                }
            }
        } catch (Exception e) {
            if (LOG.isErrorEnabled()) {
                LOG.error(e.toString());
            }
        }
    }

    private Configuration conf;

    /**
     * Scan the HTML document looking at possible indications of content
     * language<br>
     * <li>1. html lang attribute
     * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) <li>2. meta
     * dc.language
     * (http://dublincore.org/documents/2000/07/16/usageguide/qualified
     * -html.shtml#language) <li>3. meta http-equiv (content-language)
     * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br>
     */
    public ParseResult filter(Content content, ParseResult parseResult,
            HTMLMetaTags metaTags, DocumentFragment doc) {
        String lang = null;

        Parse parse = parseResult.get(content.getUrl());

        if (detect >= 0 && identify < 0) {
            lang = detectLanguage(parse, doc);
        } else if (detect < 0 && identify >= 0) {
            lang = identifyLanguage(parse);
        } else if (detect < identify) {
            lang = detectLanguage(parse, doc);
            if (lang == null) {
                lang = identifyLanguage(parse);
            }
        } else if (identify < detect) {
            lang = identifyLanguage(parse);
            if (lang == null) {
                lang = detectLanguage(parse, doc);
            }
        } else {
            LOG.warn("No configuration for language extraction policy is provided");
            return parseResult;
        }

        if (lang != null) {
            parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
            return parseResult;
        }

        return parseResult;
    }

    /** Try to find the document's language from page headers and metadata */
    private String detectLanguage(Parse page, DocumentFragment doc) {
        String lang = getLanguageFromMetadata(page.getData().getParseMeta());
        if (lang == null) {
            LanguageParser parser = new LanguageParser(doc);
            lang = parser.getLanguage();
        }

        if (lang != null) {
            return lang;
        }

        lang = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);

        return lang;
    }

    /** Use statistical language identification to extract page language */
    private String identifyLanguage(Parse parse) {
        StringBuilder text = new StringBuilder();
        if (parse == null)
            return null;

        String title = parse.getData().getTitle();
        if (title != null) {
            text.append(title.toString());
        }

        String content = parse.getText();
        if (content != null) {
            text.append(" ").append(content.toString());
        }

        // trim content?
        String titleandcontent = text.toString();

        if (this.contentMaxlength != -1
                && titleandcontent.length() > this.contentMaxlength)
            titleandcontent = titleandcontent.substring(0, contentMaxlength);

        LanguageIdentifier identifier = new LanguageIdentifier(titleandcontent);

        if (onlyCertain) {
            if (identifier.isReasonablyCertain())
                return identifier.getLanguage();
            else
                return null;
        }
        return identifier.getLanguage();
    }

    // Check in the metadata whether the language has already been stored there
    // by Tika
    private static String getLanguageFromMetadata(Metadata meta) {
        if (meta == null)
            return null;
        // dublin core
        String lang = meta.get("dc.language");
        if (lang != null)
            return lang;
        // meta content-language
        lang = meta.get("content-language");
        if (lang != null)
            return lang;
        // lang attribute
        return meta.get("lang");
    }

    static class LanguageParser {

        private String dublinCore = null;
        private String htmlAttribute = null;
        private String httpEquiv = null;
        private String language = null;

        LanguageParser(Node node) {
            parse(node);
            if (htmlAttribute != null) {
                language = htmlAttribute;
            } else if (dublinCore != null) {
                language = dublinCore;
            } else {
                language = httpEquiv;
            }
        }

        String getLanguage() {
            return language;
        }

        void parse(Node node) {

            NodeWalker walker = new NodeWalker(node);
            while (walker.hasNext()) {

                Node currentNode = walker.nextNode();
                String nodeName = currentNode.getNodeName();
                short nodeType = currentNode.getNodeType();

                if (nodeType == Node.ELEMENT_NODE) {

                    // Check for the lang HTML attribute
                    if (htmlAttribute == null) {
                        htmlAttribute = parseLanguage(((Element) currentNode)
                                .getAttribute("lang"));
                    }

                    // Check for Meta
                    if ("meta".equalsIgnoreCase(nodeName)) {
                        NamedNodeMap attrs = currentNode.getAttributes();

                        // Check for the dc.language Meta
                        if (dublinCore == null) {
                            for (int i = 0; i < attrs.getLength(); i++) {
                                Node attrnode = attrs.item(i);
                                if ("name".equalsIgnoreCase(attrnode
                                        .getNodeName())) {
                                    if ("dc.language".equalsIgnoreCase(attrnode
                                            .getNodeValue())) {
                                        Node valueattr = attrs
                                                .getNamedItem("content");
                                        if (valueattr != null) {
                                            dublinCore = parseLanguage(valueattr
                                                    .getNodeValue());
                                        }
                                    }
                                }
                            }
                        }

                        // Check for the http-equiv content-language
                        if (httpEquiv == null) {
                            for (int i = 0; i < attrs.getLength(); i++) {
                                Node attrnode = attrs.item(i);
                                if ("http-equiv".equalsIgnoreCase(attrnode
                                        .getNodeName())) {
                                    if ("content-language".equals(attrnode
                                            .getNodeValue().toLowerCase())) {
                                        Node valueattr = attrs
                                                .getNamedItem("content");
                                        if (valueattr != null) {
                                            httpEquiv = parseLanguage(valueattr
                                                    .getNodeValue());
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                if ((dublinCore != null) && (htmlAttribute != null)
                        && (httpEquiv != null)) {
                    return;
                }
            }
        }

        /**
         * Parse a language string and return an ISO 639 primary code, or
         * <code>null</code> if something wrong occurs, or if no language is
         * found.
         */
        final static String parseLanguage(String lang) {

            if (lang == null) {
                return null;
            }

            String code = null;
            String language = null;

            // First, split multi-valued values
            String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1);

            int i = 0;
            while ((language == null) && (i < langs.length)) {
                // Then, get the primary code
                code = langs[i].split("-")[0];
                code = code.split("_")[0];
                // Find the ISO 639 code
                language = (String) LANGUAGES_MAP.get(code.toLowerCase());
                i++;
            }

            return language;
        }

    }

    public void setConf(Configuration conf) {
        this.conf = conf;
        contentMaxlength = conf.getInt("lang.analyze.max.length", -1);
        onlyCertain = conf.getBoolean("lang.identification.only.certain", false);
        String[] policy = conf.getStrings("lang.extraction.policy");
        for (int i = 0; i < policy.length; i++) {
            if (policy[i].equals("detect")) {
                detect = i;
            } else if (policy[i].equals("identify")) {
                identify = i;
            }
        }
    }

    public Configuration getConf() {
        return this.conf;
    }

}
// TOP
//
// Related Classes of org.apache.nutch.analysis.lang.HTMLLanguageParser$LanguageParser
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.