Package ivory.core.tokenize

Source Code of ivory.core.tokenize.OpenNLPTokenizer

package ivory.core.tokenize;

import ivory.core.Constants;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.tartarus.snowball.SnowballStemmer;
import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.HadoopAlign;

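/**
 * Tokenizer for English, French, and German built on the OpenNLP maximum-entropy
 * tokenizer ({@link opennlp.tools.tokenize.TokenizerME}), with optional Snowball
 * stemming, stopword removal, and collection-vocabulary filtering.
 */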
public class OpenNLPTokenizer extends ivory.core.tokenize.Tokenizer {
  private static final Logger sLogger = Logger.getLogger(OpenNLPTokenizer.class);
  static{
    sLogger.setLevel(Level.WARN);
  }
  private Tokenizer tokenizer;
  private SnowballStemmer stemmer;
  private int lang;
  private static final int ENGLISH = 0, FRENCH = 1, GERMAN = 2;
  private static final String[] languages = {"english", "french", "german"};
  private Set<String> stopwords;
  private Set<String> stemmedStopwords;

  public OpenNLPTokenizer(){
    super();
  }

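  /**
   * Configures the tokenizer from a Hadoop {@link Configuration}, using the default
   * {@link FileSystem} for that configuration.
   */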
  @Override
  public void configure(Configuration conf){
    FileSystem fs;
    try {
      fs = FileSystem.get(conf);
    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException(e);
    }
    configure(conf, fs);
  }

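  /**
   * Configures the tokenizer: loads the OpenNLP tokenizer model, sets the language and
   * (optionally) the Snowball stemmer, reads the stopword and stemmed-stopword lists,
   * and loads the collection vocabulary if one is provided.
   */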
  @Override
  public void configure(Configuration mJobConf, FileSystem fs){
    setTokenizer(fs, new Path(mJobConf.get(Constants.TokenizerData)));
    if (mJobConf.getBoolean(Constants.Stemming, true)) {
      setLanguageAndStemmer(mJobConf.get(Constants.Language));
    } else {
      setLanguage(mJobConf.get(Constants.Language));
    }

    // read stopwords from file (stopwords will be empty set if file does not exist or is empty)
    String stopwordsFile = mJobConf.get(Constants.StopwordList);
    stopwords = readInput(fs, stopwordsFile);     
    String stemmedStopwordsFile = mJobConf.get(Constants.StemmedStopwordList);
    stemmedStopwords = readInput(fs, stemmedStopwordsFile);

    VocabularyWritable vocab;
    try {
      vocab = (VocabularyWritable) HadoopAlign.loadVocab(new Path(mJobConf.get(Constants.CollectionVocab)), fs);
      setVocab(vocab);
    } catch (Exception e) {
      sLogger.warn("No vocabulary provided to tokenizer.");
      vocab = null;
    }
    isStopwordRemoval = !stopwords.isEmpty();
 
    sLogger.warn("Stemmer: " + stemmer + "\nStopword removal is " + isStopwordRemoval +"; number of stopwords: " + stopwords.size() +"; stemmed: " + stemmedStopwords.size());
  }

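  /**
   * Loads the serialized OpenNLP tokenizer model from the given path and instantiates a
   * {@link TokenizerME} from it.
   */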
  public void setTokenizer(FileSystem fs, Path p){
    try {
      FSDataInputStream in = fs.open(p);
      TokenizerModel model = new TokenizerModel(in);
      in.close();
      tokenizer = new TokenizerME(model);
    } catch (IOException e) {
      // without a model the tokenizer is unusable, so fail fast instead of leaving it null
      throw new RuntimeException("Unable to load OpenNLP tokenizer model from " + p, e);
    }
  }

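  /**
   * Sets the language from a language code or name (e.g., "en", "fr", "de");
   * defaults to English if the language is not recognized.
   */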
  public void setLanguage(String l){
    if (l.startsWith("en")) {
      lang = ENGLISH;
    } else if (l.startsWith("fr")) {
      lang = FRENCH;
    } else if (l.equals("german") || l.startsWith("de")) {
      lang = GERMAN;
    } else {
      sLogger.warn("Language not recognized, setting to English!");
      lang = ENGLISH;
    }
  }

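  /**
   * Sets the language and instantiates the corresponding Snowball stemmer via reflection
   * (e.g., org.tartarus.snowball.ext.englishStemmer).
   */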
  public void setLanguageAndStemmer(String l){
    setLanguage(l);
    Class<?> stemClass;
    try {
      stemClass = Class.forName("org.tartarus.snowball.ext." + languages[lang] + "Stemmer");
      stemmer = (SnowballStemmer) stemClass.newInstance();
    } catch (ClassNotFoundException e) {
      sLogger.warn("Stemmer class not found: org.tartarus.snowball.ext." +
          languages[lang] + "Stemmer");
      stemmer = null;
    } catch (Exception e) {
      e.printStackTrace();
      throw new RuntimeException(e);
    }
  }

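  /**
   * Tokenizes the given text and returns the resulting terms: pre- and post-normalized,
   * lowercased, stopword-filtered (if enabled), stemmed (if a stemmer is configured), and
   * restricted to the collection vocabulary (if one was loaded).
   */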
  @Override
  public String[] processContent(String text) {
    text = preNormalize(text);
    if ( lang == FRENCH ) {
      // OpenNLP does not split off the word following an apostrophe (e.g., "l'eau"),
      // so insert a space after each apostrophe; this seems to work better in practice
      text = text.replaceAll("'", "' ");
    }

    // tokenize, then re-join so that post-normalization sees the whole string
    String[] tokens = tokenizer.tokenize(text);
    StringBuilder tokenizedText = new StringBuilder();
    for ( String token : tokens ){
      tokenizedText.append(token).append(' ');
    }

    // do post-normalizations before any stemming or stopword removal
    String[] normalizedTokens = postNormalize(tokenizedText.toString()).split(" ");
    StringBuilder output = new StringBuilder();
    for ( int i = 0; i < normalizedTokens.length; i++ ){
      String token = normalizedTokens[i].toLowerCase();
      if ( isStopwordRemoval && isDiscard(token) ) {
        // sLogger.warn("Discarded stopword " + token);
        continue;
      }

      // apply stemming on token
      String stemmedToken = token;
      if ( stemmer != null ) {
        stemmer.setCurrent(token);
        stemmer.stem();
        stemmedToken = stemmer.getCurrent();
      }

      // skip if out of vocabulary
      if ( vocab != null && vocab.get(stemmedToken) <= 0 ) {
        // sLogger.warn("Discarded OOV " + token);
        continue;
      }
      output.append(stemmedToken).append(' ');
    }

    return output.toString().trim().split(" ");
  }

  public String getLanguage() {
    return languages[lang];
  }

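  /** Returns the number of tokens OpenNLP finds in the given string, without any filtering. */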
  @Override
  public int getNumberTokens(String string){
    return tokenizer.tokenize(string).length;
  }

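  /** Returns true if the token should be discarded: too short, too long, a delimiter, or a stopword. */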
  private boolean isDiscard(String token) {
    // remove characters that may cause problems when processing further
    //    token = removeNonUnicodeChars(token);

    return ( token.length() < MIN_LENGTH || token.length() > MAX_LENGTH || delims.contains(token) || stopwords.contains(token) );
  }

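  /** Normalizes and lowercases the token, then applies the Snowball stemmer if one is configured. */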
  @Override
  public String stem(String token) {
    token = postNormalize(preNormalize(token)).toLowerCase();
    if ( stemmer!=null ) {
      stemmer.setCurrent(token);
      stemmer.stem();
      return stemmer.getCurrent();
    }else {
      return token;
    }
  }

  /**
   * For external use. Returns true if the token is a Galago stopword or one of the
   * delimiters: `~!@#$%^&*()-_=+]}[{\\|'\";:/?.>,<
   */
  @Override
  public boolean isStopWord(String token) {
    if (stopwords == null) {
      sLogger.warn("Tokenizer does not have stopwords loaded!");
      return false;
    } else {
      return ( stopwords.contains(token) || delims.contains(token) );
    }
  }

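  /** For external use. Returns true if the token is a stemmed stopword or a delimiter. */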
  @Override
  public boolean isStemmedStopWord(String token) {
    if (stemmedStopwords == null) {
      sLogger.warn("Tokenizer does not have stemmed stopwords loaded!");
      return false;
    } else {
      return ( stemmedStopwords.contains(token) || delims.contains(token) );
    }
  }

}
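
A minimal usage sketch (not part of the Ivory source): it drives OpenNLPTokenizer through the same ivory.core.Constants keys that configure() reads above. The file paths are placeholders, and it assumes the stopword files exist and that the inherited readInput handles them as the comments in configure() suggest.

import ivory.core.Constants;
import ivory.core.tokenize.OpenNLPTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class OpenNLPTokenizerExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set(Constants.Language, "english");
    conf.set(Constants.TokenizerData, "/path/to/en-token.bin");              // OpenNLP tokenizer model (placeholder path)
    conf.set(Constants.StopwordList, "/path/to/stopwords.en");               // plain-text stopword list (placeholder path)
    conf.set(Constants.StemmedStopwordList, "/path/to/stopwords.en.stemmed"); // stemmed stopword list (placeholder path)
    conf.setBoolean(Constants.Stemming, true);
    // Constants.CollectionVocab is left unset; configure() logs a warning and skips vocabulary filtering.

    OpenNLPTokenizer tokenizer = new OpenNLPTokenizer();
    tokenizer.configure(conf, FileSystem.getLocal(conf));

    // Terms come back lowercased, stemmed, and with stopwords removed.
    for (String term : tokenizer.processContent("The quick brown foxes jumped over the lazy dogs.")) {
      System.out.println(term);
    }
  }
}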