Package org.apache.ctakes.postagger

Source Code of org.apache.ctakes.postagger.POSTagger

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/**
* This class was derived directly from the example annotator provided by the Apache
* UIMA distribution 2.2.1 in the opennlp_wrappers directory of the uimaj-examples project.
*
* The following changes have been made:
* - import of different sentence and token types.
* - removed original comments
* - added TAG_DICTIONARY_PARAM
* - added CASE_SENSITIVE_PARAM
* - typed the collections used in process
* - throws an exception instead of printing out an error message.
*
* Please read the README in the top-level directory of this project for further details. 
*/

package org.apache.ctakes.postagger;

import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;

//import opennlp.tools.lang.english.PosTagger;
import opennlp.model.AbstractModel;
import opennlp.tools.postag.POSDictionary;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.TagDictionary;

import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import org.apache.ctakes.core.resource.FileLocator;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.textspan.Sentence;

public class POSTagger extends JCasAnnotator_ImplBase {

  // LOG4J logger based on class name
  private Logger logger = Logger.getLogger(getClass().getName());

  /**
   * "PosModelFile" is a required, single, string parameter that contains the
   * file name of the part of speech tagger model. The model file name should
   * end with ".bin.gz" or ".txt". If this is not the case, then please see
   * resources/models/README.
   */
  public static final String POS_MODEL_FILE_PARAM = "PosModelFile";

  /**
   * "TagDictionary" is an optional, single, string parameter that contains
   * the file name of the part-of-speech tag dictionary. For relevant
   * discussion of the difference between a "tag dictionary" and a
   * "dictionary" see:
   * <p><ul>
   * <li>
   * <a href="https://sourceforge.net/forum/forum.php?thread_id=1720863&forum_id=9943">PosTagger - with/without dictionaries? and ..</a>
   * <li>
   * </li>
   * <a href="https://sourceforge.net/forum/forum.php?thread_id=1894043&forum_id=9943">Create a new dict file</a>
   * </li>
   * </ul>
   * For information about how to create a TagDictionary, please see the
   * README in top-level directory of this project.
   *
   * @see TagDictionary
   * @see POSDictionary
   */
  public static final String TAG_DICTIONARY_PARAM = "TagDictionary";

  /**
   * "CaseSensitive" is a required, single, boolean parameter that specifies
   * how to access entries in the tag dictionary. If you give the value
   * "false", then you should really have a tag dictionary that is also case
   * insensitive. Please see the README in top-level directory of this project
   * for details on how to create a case insensitive tag dictionary.
   *
   * <br>
   * This parameter has no effect if no tag dictionary is provided but is
   * required if a tag dictionary is provided.
   *
   * @see POSDictionary#POSDictionary(String, boolean)
   *
   */
  public static final String CASE_SENSITIVE_PARAM = "CaseSensitive";

  private opennlp.tools.postag.POSTaggerME tagger;

  public void initialize(UimaContext uimaContextthrows ResourceInitializationException {
    super.initialize(uimaContext);

    String posModelPath = null;

    try {
      posModelPath = (String) uimaContext.getConfigParameterValue(POS_MODEL_FILE_PARAM);
      File posModelFile = FileLocator.locateFile(posModelPath);
      String modelFileAbsPath = posModelFile.getAbsolutePath();
      logger.info("POS tagger model file: " + modelFileAbsPath);

      boolean caseSensitive = (Boolean) uimaContext.getConfigParameterValue(CASE_SENSITIVE_PARAM);
      String tagDictionaryPath = (String) uimaContext.getConfigParameterValue(TAG_DICTIONARY_PARAM);

      TagDictionary tagDictionary = null;
      if (tagDictionaryPath != null && !tagDictionaryPath.trim().equals("")) {
        File tagDictFile = FileLocator.locateFile(tagDictionaryPath);
        String tagDictFileAbsPath = tagDictFile.getAbsolutePath();
        logger.info("POS tagger tag-dictionary: " + tagDictFileAbsPath);

        tagDictionary = new POSDictionary(tagDictFileAbsPath, caseSensitive);
      } else {
        logger.info("No POS tagger tag-dictionary.");
      }

      FileInputStream fis = new FileInputStream(posModelFile);
      POSModel modelFile = new POSModel(fis); // skip using the tag dictionary for now since OpenNLP (1.5) changed
      tagger = new opennlp.tools.postag.POSTaggerME(modelFile); //, tagDictionary);

    } catch (Exception e) {
      logger.info("POS tagger model: " + posModelPath);
      throw new ResourceInitializationException(e);
    }
  }

  public void process(JCas jCas) throws AnalysisEngineProcessException {

    logger.info("process(JCas)");

    List<BaseToken> tokens = new ArrayList<BaseToken>();
    List<String> words = new ArrayList<String>();

    AnnotationIndex baseTokenIndex = jCas.getAnnotationIndex(BaseToken.type);

    FSIterator sentences = jCas.getAnnotationIndex(Sentence.type).iterator();

    while (sentences.hasNext()) {
      Sentence sentence = (Sentence) sentences.next();

      tokens.clear();
      words.clear();

      FSIterator tokenIterator = baseTokenIndex.subiterator(sentence);
      while (tokenIterator.hasNext()) {
        BaseToken token = (BaseToken) tokenIterator.next();
        tokens.add(token);
        words.add(token.getCoveredText());
      }

      List<?> wordTagList = null; // List of BaseToken's
      if (words.size() > 0) {
        wordTagList = tagger.tag(words);
      }
      // else {
      //   logger.info("sentence has no words = '" + sentence.getCoveredText()
      //     + "' at (" +sentence.getBegin() + "," + sentence.getEnd() + ")");
      // }

      try {
        for (int i = 0; i < tokens.size(); i++) {
          BaseToken token = (BaseToken) tokens.get(i);
          String posTag = (String) wordTagList.get(i);
          token.setPartOfSpeech(posTag);
        }
      } catch (IndexOutOfBoundsException e) {
        throw new AnalysisEngineProcessException(
            "sentence being tagged is: '" + sentence.getCoveredText() + "'", null, e);
      }
    }
  }
}
TOP

Related Classes of org.apache.ctakes.postagger.POSTagger

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.