Package ivory.core.tokenize

Source Code of ivory.core.tokenize.OpenNLPTokenizer

package ivory.core.tokenize;


import ivory.core.util.CLIRUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.tartarus.snowball.SnowballStemmer;


import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.HadoopAlign;

public class OpenNLPTokenizer implements ivory.core.tokenize.Tokenizer {
  private static final Logger sLogger = Logger.getLogger(DocumentProcessingUtils.class);
  static{
    sLogger.setLevel(Level.WARN);
  }
  Tokenizer tokenizer;
  SnowballStemmer stemmer;
  String lang;
  protected static int NUM_PREDS, MIN_LENGTH = 2, MAX_LENGTH = 50;
  String delims = "`~!@#$%^&*()-_=+]}[{\\|'\";:/?.>,<";
  VocabularyWritable vocab;

  public OpenNLPTokenizer(){
    super();
  }

  public void configure(Configuration mJobConf){
    FileSystem fs;
    try {
      fs = FileSystem.get(mJobConf);
    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException(e);
    }
    setTokenizer(fs, new Path(mJobConf.get("Ivory.TokenizerModel")));
    setLanguageAndStemmer(mJobConf.get("Ivory.Lang"));
    VocabularyWritable vocab;
    try {
      vocab = (VocabularyWritable) HadoopAlign.loadVocab(new Path(mJobConf.get("Ivory.CollectionVocab")), fs);
      setVocab(vocab);
    } catch (IOException e) {
      sLogger.warn("VOCAB IS NULL!");
      vocab = null;
    }
  }

  public void setTokenizer(FileSystem fs, Path p){
    try {
      FSDataInputStream in = fs.open(p);
      TokenizerModel model;
      model = new TokenizerModel(in);
      tokenizer = new TokenizerME(model);
    }
    catch (IOException e) {
      e.printStackTrace();
    }
  }

  public void setLanguageAndStemmer(String l){
    if(l.startsWith("en")){
      lang = "english";
    }else if(l.startsWith("fr")){
      lang = "french";
    }else if(l.equals("german") || l.startsWith("de")){
      lang = "german";
    }else{
      sLogger.warn("Language not recognized!");
    }
    Class stemClass;
    try {
      stemClass = Class.forName("org.tartarus.snowball.ext." +
          lang + "Stemmer");
      stemmer = (SnowballStemmer) stemClass.newInstance();
    } catch (ClassNotFoundException e) {
      sLogger.warn("Stemmer class not recognized!\n"+"org.tartarus.snowball.ext." +
          lang + "Stemmer");
      stemmer = null;
      return;
    } catch (Exception e) {
      e.printStackTrace();
      throw new RuntimeException(e);
    }
  }

  public void setVocab(VocabularyWritable v){
    vocab = v;
  }

  public String[] processContent(String text) {
    String[] tokens = tokenizer.tokenize(text);
    List<String> stemmedTokens = new ArrayList<String>();
    for(String token : tokens){
      token = removeNonUnicodeChars(token);

      if(token.length() < MIN_LENGTH || token.length() > MAX_LENGTH || delims.contains(token))  continue;
      String stemmed = token;
      if(stemmer!=null){
        stemmer.setCurrent(token);
        stemmer.stem();
        stemmed = stemmer.getCurrent().toLowerCase();
      }
      //skip if out of vocab
      if(vocab!=null && vocab.get(stemmed)<=0){
        continue;
      }
      stemmedTokens.add(stemmed);
    }

    String[] tokensArray = new String[stemmedTokens.size()];
    int i=0;
    for(String ss : stemmedTokens){
      tokensArray[i++]=ss;
    }
    return tokensArray;

  }
 
  public String processContent2(String text) {
    String[] tokens = tokenizer.tokenize(text);
    String line = "";
    for(String token : tokens){
      token = removeNonUnicodeChars(token);
      String stemmed = token;
      if(stemmer!=null){
        stemmer.setCurrent(token);
        stemmer.stem();
        stemmed = stemmer.getCurrent().toLowerCase();
      }
      if(!stemmed.isEmpty()){
        line += stemmed+" ";
      }
    }
    return line;
  }
 
  private String removeNonUnicodeChars(String token) {
    StringBuffer fixedToken = new StringBuffer();
    for(int i=0; i<token.length(); i++){
      char c = token.charAt(i);
      if(Character.getNumericValue(c)>=0){
        fixedToken.append(c);
      }
    }
    return fixedToken.toString();
  }

  public static void main(String[] args) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException{
    String s ="965 v. Chr.\n#redirect [[10. Jahrhundert v. Chr.]]";
    //    OpenNLPTokenizer eTokenizer = new OpenNLPTokenizer();
    //    eTokenizer.setTokenizer(FileSystem.get(new Configuration()), new Path("/Users/ferhanture/edu/research/programs/opennlp-tools-1.4.3/models/EnglishTok.bin"));
    //    eTokenizer.setLanguageAndStemmer("english");
    //    eTokenizer.setIsWiki(true);
    //    FileSystem localFs=null;
    //    try {
    //      localFs = FileSystem.get(new Configuration());
    //    } catch (IOException e2) {
    //    }
    //    String indexPath = "/Users/ferhanture/edu/research/data/de-en/eu-nc-wmt08";
    //    String srcVocabFile = indexPath+"/berkeleyaligner.vocab.eng";
    //    VocabularyWritable engVocabH = (VocabularyWritable) HadoopAlign.loadVocab(new Path(srcVocabFile), localFs);
    //
    //
    //    eTokenizer.setVocab(engVocabH);
    //    String[] ss = eTokenizer.processContent(s);
    //    System.out.println();

    OpenNLPTokenizer eTokenizer = new OpenNLPTokenizer();
    eTokenizer.setTokenizer(FileSystem.get(new Configuration()), new Path("/Users/ferhanture/edu/research/programs/opennlp-tools-1.4.3/models/GermanTok.bin"));
    eTokenizer.setLanguageAndStemmer("german");
//    eTokenizer.setIsWiki(true);
    FileSystem localFs=null;
    try {
      localFs = FileSystem.get(new Configuration());
    } catch (IOException e2) {
    }
    String indexPath = "/Users/ferhanture/edu/research/data/de-en/eu-nc-wmt08";
    String srcVocabFile = indexPath+"/berkeleyaligner.vocab.ger";
    VocabularyWritable engVocabH = (VocabularyWritable) HadoopAlign.loadVocab(new Path(srcVocabFile), localFs);


    eTokenizer.setVocab(engVocabH);
    String[] ss = eTokenizer.processContent(s);
    System.out.println();
  }
}
TOP

Related Classes of ivory.core.tokenize.OpenNLPTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.