// Source code of org.dbpedia.spotlight.spot.OpenNLPNGramSpotter

/*
 * Copyright 2012 DBpedia Spotlight Development Team
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 *  Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
 */


package org.dbpedia.spotlight.spot;


import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import opennlp.tools.util.model.BaseModel;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dbpedia.spotlight.exceptions.ConfigurationException;
import org.dbpedia.spotlight.model.SpotlightConfiguration;
import org.dbpedia.spotlight.model.SurfaceForm;
import org.dbpedia.spotlight.model.SurfaceFormOccurrence;
import org.dbpedia.spotlight.model.Text;


import java.util.*;


/**
 * Consider as spots only the expressions marked as:
 * - named entities by the NER tagger,
 * - the noun-phrase chunks extracted by a shallow parser,
 * - all sub-expressions of up to 3 tokens of the noun-phrase chunks.
 *
 * This increases the coverage of NESpotter, which tends to annotate very little.
 *
 * The main advantage of this approach against the dictionary-based approach that we use now is that
 * it will not require 8GB of RAM to store a dictionary at runtime.
 *
 * TODO requires us to deal with overlaps when generating the annotations
 * TODO do not break chunks within quotes.
 * TODO do not get n-grams from NER spots, only from NP chunk spots
 *
 * @author Rohana Rajapakse (GOSS Interactive Limited) - implemented the class
 * @author pablomendes - adapted to integrate with architecture, made stopwords configurable and case insensitive, adjusted logging
 */
public class OpenNLPNGramSpotter implements Spotter {


  private final Log LOG = LogFactory.getLog(this.getClass());


  protected static BaseModel sentenceModel = null;
  protected static BaseModel chunkModel = null;
  protected static BaseModel tokenModel = null;
  protected static BaseModel posModel = null;
  protected Set<String> stopWords = SpotlightConfiguration.DEFAULT_STOPWORDS;


  //String directoryPath = "C:/software/appservers/dbp-spotlight-trunk3/data/models/opennlp/";    //now reading from configuration properties


  String directoryPath = null;


  //Need OpenNLP modles. At present they are loaded in the constructor, but they should better be loaded at the startup of
  //dbpediaSpotlight to avoid re-loading them each time a request arrives. A singleton to hold the models would do.


  public OpenNLPNGramSpotter(String opennlpmodeldir,String i18nLanguageCode) throws ConfigurationException {
        //directoryPath =  null; //for reading from dependency Jar files
        String directoryPath = opennlpmodeldir;


        if (OpenNLPNGramSpotter.sentenceModel == null) {
            OpenNLPNGramSpotter.sentenceModel  = OpenNLPUtil.loadModel(directoryPath, i18nLanguageCode +  OpenNLPUtil.OpenNlpModels.SentenceModel.filename(), OpenNLPUtil.OpenNlpModels.SentenceModel.toString());
        }
        if (OpenNLPNGramSpotter.chunkModel == null) {
            OpenNLPNGramSpotter.chunkModel  = OpenNLPUtil.loadModel(directoryPath, i18nLanguageCode +  OpenNLPUtil.OpenNlpModels.ChunkModel.filename(), OpenNLPUtil.OpenNlpModels.ChunkModel.toString());
        }
        if (OpenNLPNGramSpotter.posModel == null) {
            OpenNLPNGramSpotter.posModel  = OpenNLPUtil.loadModel(directoryPath, i18nLanguageCode +  OpenNLPUtil.OpenNlpModels.POSModel.filename(), OpenNLPUtil.OpenNlpModels.POSModel.toString());
        }
        if (OpenNLPNGramSpotter.tokenModel == null) {
            OpenNLPNGramSpotter.tokenModel  = OpenNLPUtil.loadModel(directoryPath, i18nLanguageCode +   OpenNLPUtil.OpenNlpModels.TokenizerModel.filename(), OpenNLPUtil.OpenNlpModels.TokenizerModel.toString());
        }


    }


  @Override
  public List<SurfaceFormOccurrence> extract(Text text) {


    //System.out.println("\n\nRR- extract(...) method called! with text: " + intext + "\n\n");
       
    //remove special chars from input text, and keep a list of positions of them n a list.
    //start/end offsets need to be adjusted after extracting spots from cleaned text.
    String orgText = text.text();
    List<Integer> chars2removeLst = OpenNLPUtil.chars2remove(orgText);
    String cleanText = OpenNLPUtil.cleanText(orgText, chars2removeLst);
    Text cleanTextStr = new Text(cleanText);
    //extracting NounPhrase nGrams
    List<SurfaceFormOccurrence> npNgrams = extractNPNGrams(cleanTextStr);
    /*
    System.out.println("\n\nAll NGrams of sentence:");
    System.out.println(intext + "\n");
    for( SurfaceFormOccurrence ng: npNgrams) {
      System.out.println(ng.surfaceForm() + " [" + ng.textOffset() + "]");
    }
     */
    
    
    if (npNgrams != null && !npNgrams.isEmpty()) {
      //lets correct the offsets
      for( SurfaceFormOccurrence ng: npNgrams) {
        int offset_clean = ng.textOffset();
        int offset_org = OpenNLPUtil.computeOffset(orgText, offset_clean, chars2removeLst);
        ng.setTextOffset(offset_org);
        //System.out.println(ng.surfaceForm() + " [" + ng.textOffset() + "]");
      }


      
      return npNgrams;
    }
    else {
            return (List) new ArrayList<String>();
        }
  }


    String name = "OpenNLPNGramSpotter";


  @Override
  public String getName() {
    return name;
  }
    @Override
    public void setName(String n) {
        this.name = n;
    }




  /**Extracts noun-phrase n-grams from the given piece of input text. 
   * @param text  A Text object containing the input from where to extract NP n-grams
   * @return A list of SurfaceFormOccurrence objects.
   */
  protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) {
        String intext = text.text();
    //System.out.println("\n\nRR- nextractNPNGrams(...) method called! with text: " + intext + "\n\n");
    List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
    SentenceDetectorME  sentenceDetector = new SentenceDetectorME((SentenceModel)sentenceModel);
    TokenizerME tokenizer = new TokenizerME((TokenizerModel)tokenModel);
    POSTaggerME posTagger = new POSTaggerME((POSModel)posModel);
    ChunkerME chunker = new ChunkerME((ChunkerModel)chunkModel);


    Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
    for (Span sentSpan : sentSpans) {
      String sentence = sentSpan.getCoveredText(intext).toString();
      int start = sentSpan.getStart();
      Span[] tokSpans = tokenizer.tokenizePos(sentence);
      String[] tokens = new String[tokSpans.length];
      // System.out.println("\n\nTokens:");
      for (int i = 0; i < tokens.length; i++) {
        tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        // System.out.println(tokens[i]);
      }
      String[] tags = posTagger.tag(tokens);
      Span[] chunks = chunker.chunkAsSpans(tokens, tags);
      for (Span chunk : chunks) {
        if ("NP".equals(chunk.getType())) {
          //Note: getStart()/getEnd() methods of Chunk spans only give the start and end token indexes of the chunk.
          //The actual Start/End positions of the chunk in the sentence need to be extracted from POS sentenceSpans.
          //They are offsets from the begining of the sentence in question. Need to add the start postion of the sentence
          //to compute the actual start/end offsets from the begining of the input text.
          int begin = tokSpans[chunk.getStart()].getStart();
          int end =   tokSpans[chunk.getEnd() - 1].getEnd();
          List<Map<String,Integer>> ngrampos = extractNGramPos(chunk.getStart(),chunk.getEnd() + -1);
          extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst);
        }
      }
    }
    return npNgramSFLst;
  }
  
  public void extractNGrams(List<Map<String,Integer>> ngrampos, int start, Text text, Span[] tokSpans, List<SurfaceFormOccurrence> sfOccurrences) {
    String intext = text.text();
        for( Map<String,Integer> mapelem: ngrampos) {
      int starttokenidx = mapelem.get("start");
      int endtokenidx = mapelem.get("end");
      //restrict to max 3-word phrases
      int noftkens = endtokenidx - starttokenidx;
      boolean ignorephrase = false;
      int begin = start + tokSpans[starttokenidx].getStart();
      int end =   start + tokSpans[endtokenidx].getEnd();
      String txtform = intext.substring(begin,end);
            //ignore empty phrases
            if (txtform.trim().length()==0) {
                //System.out.println("empty txtform");
                continue;
            }
      //Ignore phrases that contain more than 3-terms. It is unlikely that such long phrases to hit any resources.
      //TODO Need to experiment with the cut-off value though.
      if ( noftkens > 2)   ignorephrase = true;
      //ignore phrases starting with a stopword
      int starttkn_begin = start + tokSpans[starttokenidx].getStart();
      int starttkn_end = start + tokSpans[starttokenidx].getEnd();
      String starttknTxt = intext.substring(starttkn_begin,starttkn_end);
      if (isStopWord(starttknTxt)) ignorephrase = true;
      //ignore phrases ending with a stopword
      int endtkn_begin = start + tokSpans[endtokenidx].getStart();
      int endtkn_end = start + tokSpans[endtokenidx].getEnd();
      String endtknTxt = intext.substring(endtkn_begin,endtkn_end);
      if (isStopWord(endtknTxt)) ignorephrase = true;


      if (!ignorephrase) {                
        NGram ng = new NGram(txtform, begin, end);
        SurfaceForm surfaceForm = new SurfaceForm(ng.getTextform());


                assert !ng.getTextform().isEmpty();


        SurfaceFormOccurrence sfocc =  new SurfaceFormOccurrence(surfaceForm, text, ng.getStart());
        if (surfaceForm.name().trim().length()>0 && !sfOccurrences.contains(sfocc)) {
          sfOccurrences.add(sfocc);
        }
      }
    }
  }
    
  /**Generates a list of start/end tokens (indexes) of all sub=phrases/n-grams, given start and end token indexes
   * e.g. if start token index and end token index are 5 and 7 (means token 5,6 and 7 makes up a noun phrase)
   *      then generate [5], [5,6], [5,6,7], [6], [6,7] and [7] as sub-phrases (n-grams) of the the original phrase.
   * @param startpos
   * @param endpos
   * @return A list of Maps. A Map element has only two keys "start" and "end".
   */
  public List<Map<String,Integer>>  extractNGramPos(int startpos, int endpos) {
    List<Map<String,Integer>> ngrampos1 = new ArrayList<Map<String,Integer>>();
    if (startpos <=endpos) {


      for (int i = startpos; i <=endpos; i++) {
        for (int j=i; j<=endpos;j++) {
          int start = i;
          int end = j;
          Map<String,Integer> posmap = new HashMap<String,Integer>();
          posmap.put("start", start);
          posmap.put("end", end);
          ngrampos1.add(posmap);
        }
      }
    }
    return ngrampos1;
  }  


  /**Uses the stopWords from Lucene (StopAnalyzer.ENGLISH_STOP_WORDS_SET) to find if a given piece of text is
   * a stopword.
   * @param word
   * @return true if the input text is a stopword.
   */
  private boolean isStopWord(String word) {
    boolean ret = false;
    ret = stopWords.contains(word.toLowerCase());
    return ret;
  }
  
}
// Source code of org.dbpedia.spotlight.spot.OpenNLPNGramSpotter

// Related classes of org.dbpedia.spotlight.spot.OpenNLPNGramSpotter