Source Code of org.apache.ctakes.postagger.POSTagger

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


/**
 * This class was derived directly from the example annotator provided by the Apache
 * UIMA distribution 2.2.1 in the opennlp_wrappers directory of the uimaj-examples project.
 * 
 * The following changes have been made:
 * - import of different sentence and token types.
 * - removed original comments
 * - added TAG_DICIONARY_PARAM
 * - added CASE_SENSITIVE_PARAM
 * - typed the collections used in process
 * - throws an exception instead of printing out an error message.
 * 
 * Please read the README in the top-level directory of this project for further details.  
 */


package org.apache.ctakes.postagger;


import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;


//import opennlp.tools.lang.english.PosTagger;
import opennlp.model.AbstractModel;
import opennlp.tools.postag.POSDictionary;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.TagDictionary;


import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;


import org.apache.ctakes.core.resource.FileLocator;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.textspan.Sentence;


public class POSTagger extends JCasAnnotator_ImplBase {


  // LOG4J logger based on class name
  private Logger logger = Logger.getLogger(getClass().getName());


  /**
   * "PosModelFile" is a required, single, string parameter that contains the
   * file name of the part of speech tagger model. The model file name should
   * end with ".bin.gz" or ".txt". If this is not the case, then please see
   * resources/models/README.
   */
  public static final String POS_MODEL_FILE_PARAM = "PosModelFile";


  /**
   * "TagDictionary" is an optional, single, string parameter that contains
   * the file name of the part-of-speech tag dictionary. For relevant
   * discussion of the difference between a "tag dictionary" and a
   * "dictionary" see:
   * <p><ul>
   * <li>
   * <a href="https://sourceforge.net/forum/forum.php?thread_id=1720863&forum_id=9943">PosTagger - with/without dictionaries? and ..</a>
   * <li> 
   * </li>
   * <a href="https://sourceforge.net/forum/forum.php?thread_id=1894043&forum_id=9943">Create a new dict file</a>
   * </li>
   * </ul>
   * For information about how to create a TagDictionary, please see the
   * README in top-level directory of this project.
   * 
   * @see TagDictionary
   * @see POSDictionary
   */
  public static final String TAG_DICTIONARY_PARAM = "TagDictionary";


  /**
   * "CaseSensitive" is a required, single, boolean parameter that specifies
   * how to access entries in the tag dictionary. If you give the value
   * "false", then you should really have a tag dictionary that is also case
   * insensitive. Please see the README in top-level directory of this project
   * for details on how to create a case insensitive tag dictionary.
   * 
   * <br>
   * This parameter has no effect if no tag dictionary is provided but is
   * required if a tag dictionary is provided.
   * 
   * @see POSDictionary#POSDictionary(String, boolean)
   * 
   */
  public static final String CASE_SENSITIVE_PARAM = "CaseSensitive";


  private opennlp.tools.postag.POSTaggerME tagger;


  public void initialize(UimaContext uimaContext)  throws ResourceInitializationException {
    super.initialize(uimaContext);


    String posModelPath = null;


    try {
      posModelPath = (String) uimaContext.getConfigParameterValue(POS_MODEL_FILE_PARAM);
      File posModelFile = FileLocator.locateFile(posModelPath);
      String modelFileAbsPath = posModelFile.getAbsolutePath();
      logger.info("POS tagger model file: " + modelFileAbsPath);


      boolean caseSensitive = (Boolean) uimaContext.getConfigParameterValue(CASE_SENSITIVE_PARAM);
      String tagDictionaryPath = (String) uimaContext.getConfigParameterValue(TAG_DICTIONARY_PARAM);


      TagDictionary tagDictionary = null;
      if (tagDictionaryPath != null && !tagDictionaryPath.trim().equals("")) {
        File tagDictFile = FileLocator.locateFile(tagDictionaryPath);
        String tagDictFileAbsPath = tagDictFile.getAbsolutePath();
        logger.info("POS tagger tag-dictionary: " + tagDictFileAbsPath);


        tagDictionary = new POSDictionary(tagDictFileAbsPath, caseSensitive);
      } else {
        logger.info("No POS tagger tag-dictionary.");
      }


      FileInputStream fis = new FileInputStream(posModelFile);
      POSModel modelFile = new POSModel(fis); // skip using the tag dictionary for now since OpenNLP (1.5) changed
      tagger = new opennlp.tools.postag.POSTaggerME(modelFile); //, tagDictionary);


    } catch (Exception e) {
      logger.info("POS tagger model: " + posModelPath);
      throw new ResourceInitializationException(e);
    }
  }


  public void process(JCas jCas) throws AnalysisEngineProcessException {


    logger.info("process(JCas)");


    List<BaseToken> tokens = new ArrayList<BaseToken>();
    List<String> words = new ArrayList<String>();


    AnnotationIndex baseTokenIndex = jCas.getAnnotationIndex(BaseToken.type);


    FSIterator sentences = jCas.getAnnotationIndex(Sentence.type).iterator();


    while (sentences.hasNext()) {
      Sentence sentence = (Sentence) sentences.next();


      tokens.clear();
      words.clear();


      FSIterator tokenIterator = baseTokenIndex.subiterator(sentence);
      while (tokenIterator.hasNext()) {
        BaseToken token = (BaseToken) tokenIterator.next();
        tokens.add(token);
        words.add(token.getCoveredText());
      }


      List<?> wordTagList = null; // List of BaseToken's
      if (words.size() > 0) {
        wordTagList = tagger.tag(words);
      }
      // else {
      //   logger.info("sentence has no words = '" + sentence.getCoveredText()
      //     + "' at (" +sentence.getBegin() + "," + sentence.getEnd() + ")");
      // }


      try {
        for (int i = 0; i < tokens.size(); i++) {
          BaseToken token = (BaseToken) tokens.get(i);
          String posTag = (String) wordTagList.get(i);
          token.setPartOfSpeech(posTag);
        }
      } catch (IndexOutOfBoundsException e) {
        throw new AnalysisEngineProcessException(
            "sentence being tagged is: '" + sentence.getCoveredText() + "'", null, e);
      }
    }
  }
}
Source Code of org.apache.ctakes.postagger.POSTagger

Related Classes of org.apache.ctakes.postagger.POSTagger