Source Code of org.apache.nutch.analysis.lang.HTMLLanguageParser$LanguageParser

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.analysis.lang;


// JDK imports
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;


import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
import org.apache.tika.language.LanguageIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;


public class HTMLLanguageParser implements HtmlParseFilter {


    public static final Logger LOG = LoggerFactory
            .getLogger(HTMLLanguageParser.class);


    private int detect = -1, identify = -1;


    private int contentMaxlength = -1;


    private boolean onlyCertain = false;


    /* A static Map of ISO-639 language codes */
    private static Map<String, String> LANGUAGES_MAP = new HashMap<String, String>();
    static {
        try {
            Properties p = new Properties();
            p.load(HTMLLanguageParser.class
                    .getResourceAsStream("langmappings.properties"));
            Enumeration<?> keys = p.keys();
            while (keys.hasMoreElements()) {
                String key = (String) keys.nextElement();
                String[] values = p.getProperty(key).split(",", -1);
                LANGUAGES_MAP.put(key, key);
                for (int i = 0; i < values.length; i++) {
                    LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
                }
            }
        } catch (Exception e) {
            if (LOG.isErrorEnabled()) {
                LOG.error(e.toString());
            }
        }
    }


    private Configuration conf;


    /**
     * Scan the HTML document looking at possible indications of content
     * language<br>
     * <li>1. html lang attribute
     * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) <li>2. meta
     * dc.language
     * (http://dublincore.org/documents/2000/07/16/usageguide/qualified
     * -html.shtml#language) <li>3. meta http-equiv (content-language)
     * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br>
     */
    public ParseResult filter(Content content, ParseResult parseResult,
            HTMLMetaTags metaTags, DocumentFragment doc) {
        String lang = null;


        Parse parse = parseResult.get(content.getUrl());


        if (detect >= 0 && identify < 0) {
            lang = detectLanguage(parse, doc);
        } else if (detect < 0 && identify >= 0) {
            lang = identifyLanguage(parse);
        } else if (detect < identify) {
            lang = detectLanguage(parse, doc);
            if (lang == null) {
                lang = identifyLanguage(parse);
            }
        } else if (identify < detect) {
            lang = identifyLanguage(parse);
            if (lang == null) {
                lang = detectLanguage(parse, doc);
            }
        } else {
            LOG.warn("No configuration for language extraction policy is provided");
            return parseResult;
        }


        if (lang != null) {
            parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
            return parseResult;
        }


        return parseResult;
    }


    /** Try to find the document's language from page headers and metadata */
    private String detectLanguage(Parse page, DocumentFragment doc) {
        String lang = getLanguageFromMetadata(page.getData().getParseMeta());
        if (lang == null) {
            LanguageParser parser = new LanguageParser(doc);
            lang = parser.getLanguage();
        }


        if (lang != null) {
            return lang;
        }


        lang = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);


        return lang;
    }


    /** Use statistical language identification to extract page language */
    private String identifyLanguage(Parse parse) {
        StringBuilder text = new StringBuilder();
        if (parse == null)
            return null;


        String title = parse.getData().getTitle();
        if (title != null) {
            text.append(title.toString());
        }


        String content = parse.getText();
        if (content != null) {
            text.append(" ").append(content.toString());
        }


        // trim content?
        String titleandcontent = text.toString();


        if (this.contentMaxlength != -1
                && titleandcontent.length() > this.contentMaxlength)
            titleandcontent = titleandcontent.substring(0, contentMaxlength);


        LanguageIdentifier identifier = new LanguageIdentifier(titleandcontent);


        if (onlyCertain) {
            if (identifier.isReasonablyCertain())
                return identifier.getLanguage();
            else
                return null;
        }
        return identifier.getLanguage();
    }


    // Check in the metadata whether the language has already been stored there
    // by Tika
    private static String getLanguageFromMetadata(Metadata meta) {
        if (meta == null)
            return null;
        // dublin core
        String lang = meta.get("dc.language");
        if (lang != null)
            return lang;
        // meta content-language
        lang = meta.get("content-language");
        if (lang != null)
            return lang;
        // lang attribute
        return meta.get("lang");
    }


    static class LanguageParser {


        private String dublinCore = null;
        private String htmlAttribute = null;
        private String httpEquiv = null;
        private String language = null;


        LanguageParser(Node node) {
            parse(node);
            if (htmlAttribute != null) {
                language = htmlAttribute;
            } else if (dublinCore != null) {
                language = dublinCore;
            } else {
                language = httpEquiv;
            }
        }


        String getLanguage() {
            return language;
        }


        void parse(Node node) {


            NodeWalker walker = new NodeWalker(node);
            while (walker.hasNext()) {


                Node currentNode = walker.nextNode();
                String nodeName = currentNode.getNodeName();
                short nodeType = currentNode.getNodeType();


                if (nodeType == Node.ELEMENT_NODE) {


                    // Check for the lang HTML attribute
                    if (htmlAttribute == null) {
                        htmlAttribute = parseLanguage(((Element) currentNode)
                                .getAttribute("lang"));
                    }


                    // Check for Meta
                    if ("meta".equalsIgnoreCase(nodeName)) {
                        NamedNodeMap attrs = currentNode.getAttributes();


                        // Check for the dc.language Meta
                        if (dublinCore == null) {
                            for (int i = 0; i < attrs.getLength(); i++) {
                                Node attrnode = attrs.item(i);
                                if ("name".equalsIgnoreCase(attrnode
                                        .getNodeName())) {
                                    if ("dc.language".equalsIgnoreCase(attrnode
                                            .getNodeValue())) {
                                        Node valueattr = attrs
                                                .getNamedItem("content");
                                        if (valueattr != null) {
                                            dublinCore = parseLanguage(valueattr
                                                    .getNodeValue());
                                        }
                                    }
                                }
                            }
                        }


                        // Check for the http-equiv content-language
                        if (httpEquiv == null) {
                            for (int i = 0; i < attrs.getLength(); i++) {
                                Node attrnode = attrs.item(i);
                                if ("http-equiv".equalsIgnoreCase(attrnode
                                        .getNodeName())) {
                                    if ("content-language".equals(attrnode
                                            .getNodeValue().toLowerCase())) {
                                        Node valueattr = attrs
                                                .getNamedItem("content");
                                        if (valueattr != null) {
                                            httpEquiv = parseLanguage(valueattr
                                                    .getNodeValue());
                                        }
                                    }
                                }
                            }
                        }
                    }
                }


                if ((dublinCore != null) && (htmlAttribute != null)
                        && (httpEquiv != null)) {
                    return;
                }
            }
        }


        /**
         * Parse a language string and return an ISO 639 primary code, or
         * <code>null</code> if something wrong occurs, or if no language is
         * found.
         */
        final static String parseLanguage(String lang) {


            if (lang == null) {
                return null;
            }


            String code = null;
            String language = null;


            // First, split multi-valued values
            String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1);


            int i = 0;
            while ((language == null) && (i < langs.length)) {
                // Then, get the primary code
                code = langs[i].split("-")[0];
                code = code.split("_")[0];
                // Find the ISO 639 code
                language = (String) LANGUAGES_MAP.get(code.toLowerCase());
                i++;
            }


            return language;
        }


    }


    public void setConf(Configuration conf) {
        this.conf = conf;
        contentMaxlength = conf.getInt("lang.analyze.max.length", -1);
        onlyCertain = conf.getBoolean("lang.identification.only.certain", false);
        String[] policy = conf.getStrings("lang.extraction.policy");
        for (int i = 0; i < policy.length; i++) {
            if (policy[i].equals("detect")) {
                detect = i;
            } else if (policy[i].equals("identify")) {
                identify = i;
            }
        }
    }


    public Configuration getConf() {
        return this.conf;
    }


}
Source Code of org.apache.nutch.analysis.lang.HTMLLanguageParser$LanguageParser

Related Classes of org.apache.nutch.analysis.lang.HTMLLanguageParser$LanguageParser