// Source Code of org.dbpedia.spotlight.spot.OpenNLPUtil

/*
 * Copyright 2012 DBpedia Spotlight Development Team
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 *  Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
 */


package org.dbpedia.spotlight.spot;


import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.model.BaseModel;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dbpedia.spotlight.exceptions.ConfigurationException;


import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * @author pablomendes
 */
public class OpenNLPUtil {


    private static final Log LOG = LogFactory.getLog(OpenNLPUtil.class);




    public enum OpenNlpModels {
        SentenceModel("-sent"),
        ChunkModel("-chunker"),
        TokenizerModel("-token"),
        POSModel("-pos-maxent"),
        person("-ner-person"),
        organization("-ner-organization"),
        location("-ner-location");


        private final String name; // filename
        OpenNlpModels(String fname) {
            this.name = fname;
        }
        public String filename()   { return name + ".bin"; }
        public File file()   { return new File(name); }


    }


    public static BaseModel loadOpenNlpModel(String modelType, InputStream in) throws IOException {
        OpenNLPUtil.OpenNlpModels m = OpenNLPUtil.OpenNlpModels.valueOf(modelType);
        BaseModel mdl = loadgivenmodeltype( m, in);
        return mdl;
    }


    /**Loads OpenNLP 5 models.
     * @param directoryPath Path of the FS directory. Used when creating/opening an InputStream to a file
     *        model file in the folder (direct file reading)
     * @param modelRelativePath This is the to the model file starting from a resource folder (i.e. when reading
     *   from a jar, this is the path of the model file in the jar file followed by the model file name.
     *   e.g. in case if model files are in a folder named "opennlp" in the jar file, then we can set "opennlp"
     *   to directorypath and "english/en-sent.zip" to model relativepath (note the modelfile en-sent.zip) is
     *   assumed to to be in opennlp/english/en-sent.zip.
     * @param modelType
     * @return
     * @throws IOException
     */
    protected static BaseModel loadModel(String directoryPath, String modelRelativePath, String modelType) throws ConfigurationException {
        ClassLoader loader = OpenNLPUtil.class.getClassLoader();
        InputStream in = null;
        try {
            if (directoryPath != null && directoryPath.length() > 0) {
                // load custom models from the provided FS directory
                File modelData = new File(new File(directoryPath),  modelRelativePath);
                in = new FileInputStream(modelData);
                LOG.debug("**OpenNLP is Loading OpenNLP 1.5 " + modelType + " from a given directory path: " + modelData.getAbsolutePath());
            } else {
                // load default OpenNLP models from jars
                String resourcePath = "opennlp/" + modelRelativePath;
                in = loader.getResourceAsStream(resourcePath);
                LOG.debug("**OpenNLP is Loading OpenNLP 1.5 " + modelType + " model by Regular class loading: " + in.getClass().getCanonicalName());
                if (in == null) {
                    throw new IOException("could not find resource: " + resourcePath);
                }
            }
            return loadOpenNlpModel(modelType, in);
        } catch (IOException e) {
            throw new ConfigurationException("Could not load OpenNLP Model file.");
        }
    }


    protected static BaseModel loadgivenmodeltype(OpenNlpModels m, InputStream in) throws InvalidFormatException, IOException {
        BaseModel mdl = null;
        switch(m) {
            case TokenizerModel: {
                mdl = new TokenizerModel(in);
                LOG.debug("OpenNLP5 Tokenizer Model loaded: " + mdl);
                break;
            }
            case POSModel: {
                mdl = new POSModel(in);
                LOG.debug("OpenNLP5 POS Model loaded: " + mdl);
                break;
            }
            case SentenceModel: {
                mdl = new SentenceModel(in);
                LOG.debug("OpenNLP5 Sentence Model loaded: " + mdl);
                break;
            }
            case ChunkModel: {
                mdl = new ChunkerModel(in);
                LOG.debug("OpenNLP5 Sentence Model loaded: " + mdl);
                break;
            }
            case person:
            case organization:
            case location:
            {
                mdl = new TokenNameFinderModel(in);
                LOG.debug("OpenNLP5 TokenNameFinderModel Model loaded: " + mdl);
                break;
            }
            default: LOG.debug("Unknown Model Type!");


        }
        return mdl;
    }
    
  protected static int computeOffset(String orgText, int newoffset, List<Integer> remidxes) {
    int offset = -1;
    int notremoved = 0;
    int removed = 0;
    for (int i = 0; i<orgText.length() && notremoved <= newoffset; i++) {
      if (remidxes.contains(new Integer(i))) {
        removed++;
      } else {
        notremoved++;
      }
    }
    
    offset = newoffset + removed;
    return offset;
  }  
  


  
  protected static List<Integer> chars2remove(String orgText) {
    
        //See: http://en.wikipedia.org/wiki/Quotation_mark_glyphs
        char[] charArray = { '"','\u002C','\u00AB','\u00BB','\u2018','\u2019','\u201A','\u201B','\u201C','\u201D','\u201E','\u201F','\u2039','\u203A'};
    String regexp = "[";
    for (Character ch: charArray) {
      regexp = regexp + ch;
    }
    regexp = regexp + "]";
    
    //System.out.println("\nregexp: " + regexp);
    List<Integer> remCharPosLst = new ArrayList<Integer>();


    Pattern p = Pattern.compile(regexp);
      Matcher m = p.matcher(orgText); 


      while (!m.hitEnd()) {
       boolean mth = m.find();
       if (mth) {
         //System.out.println("Charater to remove: " + orgText.charAt(m.start()));
         remCharPosLst.add(m.start());
       }
      }
    return remCharPosLst;
  }
  
  
  protected static String cleanText(String orgTxt, List<Integer> remCharIdxes) {
    String cleanTxt="";
    int start = 0;
    for (int idx: remCharIdxes) {
      cleanTxt = cleanTxt + orgTxt.substring(start, idx);
      start = idx + 1;
    }
    cleanTxt = cleanTxt + orgTxt.substring(start);
    
    return cleanTxt;
  }




}
// Source Code of org.dbpedia.spotlight.spot.OpenNLPUtil

// Related Classes of org.dbpedia.spotlight.spot.OpenNLPUtil