/*
* Copyright 2012 DBpedia Spotlight Development Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
*/
package org.dbpedia.spotlight.spot;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.model.BaseModel;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dbpedia.spotlight.exceptions.ConfigurationException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static helpers for working with OpenNLP: loading model files (from a directory on the
 * file system or from the classpath) and mapping character offsets between a text and a
 * copy of that text from which some characters have been removed.
 *
 * @author pablomendes
 */
public class OpenNLPUtil {

    private static final Log LOG = LogFactory.getLog(OpenNLPUtil.class);

    /**
     * Character class matching every character reported by {@link #chars2remove(String)}:
     * the ASCII double quote, the comma (U+002C) and a range of typographic quotation marks.
     * See: http://en.wikipedia.org/wiki/Quotation_mark_glyphs
     * <p>
     * NOTE(review): the comma is not a quotation mark; it may have been meant to be another
     * code point. It is kept here because callers may rely on commas being removed - confirm.
     * <p>
     * Compiled once because the set is constant.
     */
    private static final Pattern CHARS_TO_REMOVE = Pattern.compile(
            "[\",\u00AB\u00BB\u2018\u2019\u201A\u201B\u201C\u201D\u201E\u201F\u2039\u203A]");

    /**
     * The model types known to this class. The constant names are the values accepted as
     * {@code modelType} by {@link #loadOpenNlpModel(String, InputStream)}; each constant
     * carries the suffix used to build the model file name.
     */
    public enum OpenNlpModels {
        SentenceModel("-sent"),
        ChunkModel("-chunker"),
        TokenizerModel("-token"),
        POSModel("-pos-maxent"),
        person("-ner-person"),
        organization("-ner-organization"),
        location("-ner-location");

        /** File name suffix of the model, without the ".bin" extension. */
        private final String name;

        OpenNlpModels(String fname) {
            this.name = fname;
        }

        /** @return the suffix followed by ".bin" */
        public String filename() {
            return name + ".bin";
        }

        /**
         * @return a {@link File} built from the bare suffix.
         *         NOTE(review): unlike {@link #filename()} this does not append ".bin" -
         *         confirm against callers whether that is intended.
         */
        public File file() {
            return new File(name);
        }
    }

    /**
     * Reads a model of the given type from an already opened stream.
     * The stream is not closed by this method; that is left to the caller.
     *
     * @param modelType name of one of the {@link OpenNlpModels} constants
     * @param in        stream positioned at the start of the model data
     * @return the loaded model
     * @throws IOException              if the model cannot be read from the stream
     * @throws IllegalArgumentException if {@code modelType} is not the name of an
     *                                  {@link OpenNlpModels} constant
     */
    public static BaseModel loadOpenNlpModel(String modelType, InputStream in) throws IOException {
        OpenNlpModels type = OpenNlpModels.valueOf(modelType);
        return loadgivenmodeltype(type, in);
    }

    /**
     * Loads an OpenNLP 1.5 model either from the file system or from the classpath.
     * <p>
     * If {@code directoryPath} is non-empty the model is read from the file
     * {@code directoryPath/modelRelativePath}. Otherwise it is read from the classpath
     * resource {@code "opennlp/" + modelRelativePath}, e.g. {@code english/en-sent.zip}
     * resolves to {@code opennlp/english/en-sent.zip}.
     * <p>
     * The stream opened here is always closed before returning.
     *
     * @param directoryPath     path of a file system directory holding the models; null or
     *                          empty to load from the classpath instead
     * @param modelRelativePath path of the model file, relative to {@code directoryPath} or
     *                          to the {@code opennlp/} classpath folder
     * @param modelType         name of one of the {@link OpenNlpModels} constants
     * @return the loaded model
     * @throws ConfigurationException if the model file or resource cannot be found or read
     */
    protected static BaseModel loadModel(String directoryPath, String modelRelativePath, String modelType) throws ConfigurationException {
        ClassLoader loader = OpenNLPUtil.class.getClassLoader();
        InputStream in = null;
        try {
            if (directoryPath != null && directoryPath.length() > 0) {
                // load custom models from the provided FS directory
                File modelData = new File(new File(directoryPath), modelRelativePath);
                in = new FileInputStream(modelData);
                LOG.debug("**OpenNLP is Loading OpenNLP 1.5 " + modelType + " from a given directory path: " + modelData.getAbsolutePath());
            } else {
                // load default OpenNLP models from jars
                String resourcePath = "opennlp/" + modelRelativePath;
                in = loader.getResourceAsStream(resourcePath);
                // getResourceAsStream signals "not found" with null, so check before touching it
                if (in == null) {
                    throw new IOException("could not find resource: " + resourcePath);
                }
                LOG.debug("**OpenNLP is Loading OpenNLP 1.5 " + modelType + " model by Regular class loading: " + in.getClass().getCanonicalName());
            }
            return loadOpenNlpModel(modelType, in);
        } catch (IOException e) {
            // Keep the root cause visible: log it in full and carry its message along.
            LOG.error("Could not load OpenNLP Model file.", e);
            throw new ConfigurationException("Could not load OpenNLP Model file. Cause: " + e.getMessage());
        } finally {
            closeQuietly(in);
        }
    }

    /** Closes the stream if it is non-null; a failure to close is only logged. */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException ignored) {
            // Nothing useful can be done at this point; the model has been read or has failed already.
            LOG.debug("Could not close OpenNLP model stream.", ignored);
        }
    }

    /**
     * Instantiates the OpenNLP model class that corresponds to the given model type.
     *
     * @param m  the kind of model to read
     * @param in stream positioned at the start of the model data; not closed here
     * @return the loaded model, or null if the type is not handled by the switch below
     * @throws InvalidFormatException propagated from the OpenNLP model constructors
     * @throws IOException            propagated from the OpenNLP model constructors
     */
    protected static BaseModel loadgivenmodeltype(OpenNlpModels m, InputStream in) throws InvalidFormatException, IOException {
        BaseModel mdl = null;
        switch (m) {
            case TokenizerModel:
                mdl = new TokenizerModel(in);
                LOG.debug("OpenNLP5 Tokenizer Model loaded: " + mdl);
                break;
            case POSModel:
                mdl = new POSModel(in);
                LOG.debug("OpenNLP5 POS Model loaded: " + mdl);
                break;
            case SentenceModel:
                mdl = new SentenceModel(in);
                LOG.debug("OpenNLP5 Sentence Model loaded: " + mdl);
                break;
            case ChunkModel:
                mdl = new ChunkerModel(in);
                LOG.debug("OpenNLP5 Chunker Model loaded: " + mdl);
                break;
            case person:
            case organization:
            case location:
                // all three named-entity types share the same model class
                mdl = new TokenNameFinderModel(in);
                LOG.debug("OpenNLP5 TokenNameFinderModel Model loaded: " + mdl);
                break;
            default:
                LOG.debug("Unknown Model Type!");
        }
        return mdl;
    }

    /**
     * Maps an offset in a cleaned text back to the corresponding offset in the original text.
     * <p>
     * The original text is scanned from the start; every removed position encountered before
     * more than {@code newoffset} kept characters have been seen is counted, and that count
     * is added to {@code newoffset}.
     *
     * @param orgText   the original text, before characters were removed
     * @param newoffset offset in the cleaned text
     * @param remidxes  positions in {@code orgText} of the removed characters; entries outside
     *                  the text are ignored, and null is treated as "nothing removed"
     * @return the offset in {@code orgText}
     */
    protected static int computeOffset(String orgText, int newoffset, List<Integer> remidxes) {
        final int length = orgText.length();

        // Flag the removed positions once so the scan below is a constant-time lookup per
        // character instead of a linear search of the list.
        boolean[] isRemoved = new boolean[length];
        if (remidxes != null) {
            for (Integer idx : remidxes) {
                if (idx != null && idx >= 0 && idx < length) {
                    isRemoved[idx] = true;
                }
            }
        }

        int kept = 0;
        int removed = 0;
        for (int i = 0; i < length && kept <= newoffset; i++) {
            if (isRemoved[i]) {
                removed++;
            } else {
                kept++;
            }
        }
        return newoffset + removed;
    }

    /**
     * Finds the positions of all characters that should be stripped from a text
     * (see {@link #CHARS_TO_REMOVE} for the exact set).
     *
     * @param orgText the text to scan
     * @return the matching positions in ascending order; empty if there are none
     */
    protected static List<Integer> chars2remove(String orgText) {
        List<Integer> remCharPosLst = new ArrayList<Integer>();
        Matcher m = CHARS_TO_REMOVE.matcher(orgText);
        while (m.find()) {
            remCharPosLst.add(m.start());
        }
        return remCharPosLst;
    }

    /**
     * Returns a copy of the text with the characters at the given positions left out.
     *
     * @param orgTxt       the original text
     * @param remCharIdxes positions to drop; they must be valid indexes into {@code orgTxt}
     *                     in ascending order (as produced by {@link #chars2remove(String)}),
     *                     otherwise {@code substring} throws
     * @return the text without the characters at those positions
     */
    protected static String cleanText(String orgTxt, List<Integer> remCharIdxes) {
        StringBuilder cleanTxt = new StringBuilder(orgTxt.length());
        int start = 0;
        for (int idx : remCharIdxes) {
            // copy the run of kept characters that precedes the removed one
            cleanTxt.append(orgTxt.substring(start, idx));
            start = idx + 1;
        }
        cleanTxt.append(orgTxt.substring(start));
        return cleanTxt.toString();
    }
}