Package org.apache.ctakes.constituency.parser

Source Code of org.apache.ctakes.constituency.parser.MaxentParserWrapper

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.constituency.parser;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.parser.AbstractBottomUpParser;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.parser.chunking.Parser;
import opennlp.tools.util.Span;

import org.apache.ctakes.constituency.parser.util.TreeUtils;
import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
import org.apache.ctakes.typesystem.type.syntax.TerminalTreebankNode;
import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.StringArray;

public class MaxentParserWrapper implements ParserWrapper {

  Parser parser = null;
  private String parseStr = "";
  Logger logger = Logger.getLogger(this.getClass().getName());
  private boolean usePos;
 
  public MaxentParserWrapper(String dataDir) {
    this(dataDir, false);
  }
 
  public MaxentParserWrapper(String dataDir, boolean usePos){
    try {
      File d = new File(dataDir);
      this.usePos = usePos;
      if (!d.isDirectory()) {
        FileInputStream fis = new FileInputStream(d);
        ParserModel model = new ParserModel(fis);
        parser = new Parser(model, AbstractBottomUpParser.defaultBeamSize, AbstractBottomUpParser.defaultAdvancePercentage);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  @Override
  public String getParseString(FSIterator tokens) {
    return parseStr;
  }

  /*
   *  (non-Javadoc)
   * @see org.chboston.cnlp.ctakes.parser.ParserWrapper#createAnnotations(org.apache.uima.jcas.JCas)
   * FIXME - Does not handle the case where a sentence is only numbers. This can happen at the end of a note
   * after "real" sentences are done where a line is just a string of numbers (looks like a ZIP code).
   * For some reason the built-in tokenizer does not like that.
   */
  @Override
  public void createAnnotations(JCas jcas) throws AnalysisEngineProcessException {
    String docId = DocumentIDAnnotationUtil.getDocumentID(jcas);
    logger.info("Started processing: " + docId);
    // iterate over sentences
    FSIterator iterator = jcas.getAnnotationIndex(Sentence.type).iterator();
    Parse parse = null;
   
    while(iterator.hasNext()){
      Sentence sentAnnot = (Sentence) iterator.next();
      if(sentAnnot.getCoveredText().length() == 0){
        continue;
      }
      FSArray termArray = TreeUtils.getTerminals(jcas, sentAnnot);
      Parse inputTokens = TreeUtils.ctakesTokensToOpennlpTokens(sentAnnot, termArray);
      String sentStr = TreeUtils.getSentence(termArray);
      if(sentStr.length() == 0){
        parse = null;
      }else{
        parse = parser.parse(inputTokens);
      }
      TopTreebankNode top = TreeUtils.buildAlignedTree(jcas, parse, sentAnnot);
      top.addToIndexes();
    }
    logger.info("Done parsing: " + docId);
  }


}
TOP

Related Classes of org.apache.ctakes.constituency.parser.MaxentParserWrapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.