Package com.livingsocial.hive.udf

Source Code of com.livingsocial.hive.udf.Tokenize

package com.livingsocial.hive.udf;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

/**
* Tokenize: splits a natural language chunk of text into an array of stemmed
* lowercase words. English stop words are excluded from the output.
*
*/
@Description(name = "tokenize", value = "_FUNC_(str) - Splits str"
    + " into an array of stemmed words")
public class Tokenize extends UDF {

  public ArrayList<Text> evaluate(Text text) throws HiveException {
    // Hive may pass a null column value; return null rather than tokenizing.
    if (text == null) {
      return null;
    }
    ArrayList<Text> result = new ArrayList<Text>();
    Analyzer analyzer = new MyAnalyzer();
    try {
      TokenStream stream = analyzer.tokenStream("",
          new StringReader(text.toString()));
      // Register the term attribute once, before consuming the stream.
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        result.add(new Text(term.toString()));
      }
      // Per the TokenStream contract, signal end-of-stream and release resources.
      stream.end();
      stream.close();
    } catch (IOException e) {
      throw new HiveException(e);
    } finally {
      analyzer.close();
    }
    return result;
  }

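  /** Holder for the default English stop word set used by {@link MyAnalyzer}. */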
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
  }

  /**
   * Custom Analyzer based on {@link StandardAnalyzer}, except that it uses
   * {@link KStemFilter} instead of the more aggressive
   * {@link PorterStemFilter}. It also adds an {@link ASCIIFoldingFilter}
   * to remove accents from words, and an {@link HTMLStripCharFilter} to
   * strip out HTML elements.
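   *
   * <p>Illustrative trace (exact output may vary by Lucene version): the
   * input {@code <b>The dog's caf&eacute; was closing</b>} has its HTML
   * stripped, is tokenized to [The, dog's, café, was, closing], possessives
   * removed and lowercased to [the, dog, café, was, closing], stop words
   * dropped to give [dog, café, closing], accents folded to
   * [dog, cafe, closing], and finally stemmed by KStem to roughly
   * [dog, cafe, close].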
   */
  private static class MyAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      Version matchVersion = Version.LUCENE_45;
      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
      TokenStream result = new StandardFilter(matchVersion, source);
      // Strip trailing possessives ("dog's" -> "dog").
      result = new EnglishPossessiveFilter(matchVersion, result);
      result = new LowerCaseFilter(matchVersion, result);
      // Drop common English stop words before stemming.
      result = new StopFilter(matchVersion, result,
          DefaultSetHolder.DEFAULT_STOP_SET);
      // Fold accented characters to their ASCII equivalents ("café" -> "cafe").
      result = new ASCIIFoldingFilter(result);
      // KStem is less aggressive than the Porter stemmer.
      result = new KStemFilter(result);
      return new TokenStreamComponents(source, result);
    }

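    /**
     * Wrap the incoming Reader so HTML tags are stripped and entities are
     * decoded before the tokenizer sees the text.
     */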
    @Override
    protected Reader initReader(String fieldName, Reader reader) {
      return new HTMLStripCharFilter(reader);
    }
  }
}
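
A quick way to sanity-check the UDF outside of Hive is to call evaluate directly. The driver below is a minimal sketch: the class name TokenizeDemo and the sample input are illustrative, and it assumes hive-exec, hadoop-common, and the Lucene 4.5 analyzers-common jar are on the classpath.

package com.livingsocial.hive.udf;

import java.util.ArrayList;

import org.apache.hadoop.io.Text;

public class TokenizeDemo {
  public static void main(String[] args) throws Exception {
    Tokenize udf = new Tokenize();
    // HTML is stripped, accents folded, and words stemmed.
    ArrayList<Text> tokens = udf.evaluate(
        new Text("<p>R&eacute;sum&eacute;s &amp; running shoes!</p>"));
    System.out.println(tokens);  // roughly [resume, run, shoe]
  }
}

Inside Hive itself, the jar would be added with ADD JAR and the function registered via CREATE TEMPORARY FUNCTION tokenize AS 'com.livingsocial.hive.udf.Tokenize'; a call such as tokenize(description) then returns an array<string>.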