Package org.apache.ctakes.coreference.util

Source Code of org.apache.ctakes.coreference.util.AttributeCalculator

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.coreference.util;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Locale;

import org.apache.ctakes.coreference.type.Markable;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.TerminalTreebankNode;
import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
//import org.apache.ctakes.typesystem.type.NamedEntity;

public class AttributeCalculator {

  static HashSet<String> stopwords;
  JCas jcas;

  Hashtable<Integer, BaseToken> hbs;
  Hashtable<Integer, BaseToken> hbe;
//  Hashtable<Integer, NamedEntity> hns;
//  Hashtable<Integer, NamedEntity> hne;

  public AttributeCalculator (JCas jcas) {
    this.jcas = jcas;

    // index the base tokens and NEs by their offsets
    hbs = new Hashtable<Integer, BaseToken>();
    hbe = new Hashtable<Integer, BaseToken>();
    FSIterator iter = jcas.getJFSIndexRepository().getAnnotationIndex(BaseToken.type).iterator();
    while (iter.hasNext()) {
      BaseToken t = (BaseToken) iter.next();
      hbs.put(t.getBegin(), t);
      hbe.put(t.getEnd(), t);
    }
  }

  public void setStopWordsList (HashSet<String> l) {
    stopwords = l;
  }

  static boolean isPronoun (Markable m) {
    if (m.getContent() instanceof BaseToken) {
      BaseToken t = (BaseToken) m.getContent();
      if (t.getPartOfSpeech().startsWith("PRP")) // TODO: since only 3rd person pronouns are added as markables, no need to check
        return true;
    }
    return false;
  }

  static boolean isDemonstrative (String s) {
    if (s.startsWith("this") ||
        s.startsWith("that") ||
        s.startsWith("these") ||
        s.startsWith("those"))
        return true;
    else return false;
  }

  static boolean isPronominal (Markable m) {
    return (isDemonstrative(m.getCoveredText()) || isPronoun(m));
  }

  static boolean isDefinite (String s) {
    return s.toLowerCase().startsWith("the ");
  }

  public String number (Markable m){
    TreebankNode node = MarkableTreeUtils.markableNode(jcas, m.getBegin(), m.getEnd());
    if(node == null) return basicNumber(m);
    try{
      TerminalTreebankNode termNode = MarkableTreeUtils.getHead(node);
      String pos = termNode.getNodeType();
      if(pos.equals("NN") || pos.equals("NNP")) return "S";
      else if(pos.equals("NNS") || pos.equals("NNPS")) return "P";
      else{
        // obviously there are many other pronouns but we don't cover personal pronouns and so
        // these are all we need.
        String word = termNode.getCoveredText();
        if(word.equalsIgnoreCase("it")) return "S";
        else if(word.equalsIgnoreCase("its")) return "S";
        else if(word.equalsIgnoreCase("they")) return "P";
        else if(word.equalsIgnoreCase("their")) return "P";
        else return "U";
      }
    }catch(NullPointerException e){
      return basicNumber(m);
    }
  }
 
  // FIXME, looks at the POS of the first NN type in the Markable, should look at the head
  // (Being fixed above in newNumber)
  String basicNumber (Markable m) {
    ArrayList<BaseToken> l = containedTokens(m.getContent().getBegin(), m.getContent().getEnd());
    for (BaseToken t : l) {
      String pos = t.getPartOfSpeech();
      if (pos.equals("NN") || pos.equals("NNP"))
        return "S";
      else if (pos.equals("NNS") || pos.equals("NNPS"))
        return "P";
//      else if (pos.equals("PRP")) {
//        if (m.getCoveredText().equalsIgnoreCase("we") || m.getCoveredText().equalsIgnoreCase("they"))
//          return "P";
//        else
//          return "S";
//      }
    }
    return "U";
  }

  public ArrayList<BaseToken> containedTokens (int a, int b) {
    ArrayList<BaseToken> ret = new ArrayList<BaseToken>();
    BaseToken t1 = hbs.get(a);
    BaseToken t2 = hbe.get(b);
    if (t1!=null && t2!=null) {
      int begin = t1.getTokenNumber();
      int end = t2.getTokenNumber();
      LinkedList<Annotation> l = FSIteratorToList.convert(jcas.getJFSIndexRepository().getAnnotationIndex(BaseToken.type).iterator());
      for (int i = 0; i < l.size(); i++) {
        BaseToken t = (BaseToken) l.get(i);
        if (t.getTokenNumber()>=begin && t.getTokenNumber()<=end)
          ret.add(t);
      }
    }
//    int e;
//    while (t!=null && (e=t.getEnd())<=b) {
//      ret.add(t);
//      t = hbs.get(e);
//    }
    return ret;
  }
 
  /*
  public TreebankNode markableNode(int a, int b){
    TreebankNode lowestDom = null;
    int overage = Integer.MAX_VALUE;
    FSIterator<Annotation> iter = jcas.getJFSIndexRepository().getAnnotationIndex(TreebankNode.type).iterator();
    while(iter.hasNext()){
      TreebankNode node = (TreebankNode) iter.next();
      if(node.getBegin() == a && node.getEnd() == b){
        // this code will drill down -- actually want to go other way
//        while(node.getChildren() != null && node.getChildren().size() == 1){
//          node = node.getChildren(0);
//        }
       
        // this code will head up as long as parent has the same span
        try{
          while(node.getParent() != null && node.getParent().getChildren().size() == 1){
            node = node.getParent();
          }
        }catch(NullPointerException e){
          System.err.println("Null pointer exception in AttributeCalculator::markableNode()");
        }
        return node;
      }else if(node.getBegin() <= a && node.getEnd() >= b){
        int tempOver = (a-node.getBegin()) + (node.getEnd()-b);
        if(tempOver < overage){
          lowestDom = node;
          overage = tempOver;
        }
      }
    }
    // There are lots of reasons to get this far -- error in the parse, personal pronoun in an NP (not annotated so not a markable),
    // unrecognized NML structure, etc.
    // Some other work will add any such nodes to the parse tree as in haghighi klein 09 (simple synt sem ...)
    // In contrast, we don't add node to the parse tree, just find the lowest node dominating the markable range
    // TODO test this
    return lowestDom;
//    return null;
  }
*/
 
  public ArrayList<String> contentWords (int begin, int end) {
    ArrayList<String> ret = new ArrayList<String>();
    ArrayList<BaseToken> l = containedTokens(begin, end);
    for (BaseToken t : l) {
      String s = t.getCoveredText().toLowerCase();
      if (!stopwords.contains(s))
        ret.add(s);
    }
    return ret;
  }

  ArrayList<String> contentWords (Markable m) {
    return contentWords(m.getBegin(), m.getEnd());
  }

}
TOP

Related Classes of org.apache.ctakes.coreference.util.AttributeCalculator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.