Source Code of org.apache.ctakes.coreference.util.AttributeCalculator

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.coreference.util;


import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Locale;


import org.apache.ctakes.coreference.type.Markable;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.TerminalTreebankNode;
import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
//import org.apache.ctakes.typesystem.type.NamedEntity;


public class AttributeCalculator {


  static HashSet<String> stopwords;
  JCas jcas;


  Hashtable<Integer, BaseToken> hbs;
  Hashtable<Integer, BaseToken> hbe;
//  Hashtable<Integer, NamedEntity> hns;
//  Hashtable<Integer, NamedEntity> hne;


  public AttributeCalculator (JCas jcas) {
    this.jcas = jcas;


    // index the base tokens and NEs by their offsets
    hbs = new Hashtable<Integer, BaseToken>();
    hbe = new Hashtable<Integer, BaseToken>();
    FSIterator iter = jcas.getJFSIndexRepository().getAnnotationIndex(BaseToken.type).iterator();
    while (iter.hasNext()) {
      BaseToken t = (BaseToken) iter.next();
      hbs.put(t.getBegin(), t);
      hbe.put(t.getEnd(), t);
    }
  }


  public void setStopWordsList (HashSet<String> l) {
    stopwords = l;
  }


  static boolean isPronoun (Markable m) {
    if (m.getContent() instanceof BaseToken) {
      BaseToken t = (BaseToken) m.getContent();
      if (t.getPartOfSpeech().startsWith("PRP")) // TODO: since only 3rd person pronouns are added as markables, no need to check
        return true;
    }
    return false;
  }


  static boolean isDemonstrative (String s) {
    if (s.startsWith("this") ||
        s.startsWith("that") ||
        s.startsWith("these") ||
        s.startsWith("those"))
        return true;
    else return false;
  }


  static boolean isPronominal (Markable m) {
    return (isDemonstrative(m.getCoveredText()) || isPronoun(m));
  }


  static boolean isDefinite (String s) {
    return s.toLowerCase().startsWith("the ");
  }


  public String number (Markable m){
    TreebankNode node = MarkableTreeUtils.markableNode(jcas, m.getBegin(), m.getEnd());
    if(node == null) return basicNumber(m);
    try{
      TerminalTreebankNode termNode = MarkableTreeUtils.getHead(node);
      String pos = termNode.getNodeType();
      if(pos.equals("NN") || pos.equals("NNP")) return "S";
      else if(pos.equals("NNS") || pos.equals("NNPS")) return "P";
      else{
        // obviously there are many other pronouns but we don't cover personal pronouns and so 
        // these are all we need.
        String word = termNode.getCoveredText();
        if(word.equalsIgnoreCase("it")) return "S";
        else if(word.equalsIgnoreCase("its")) return "S";
        else if(word.equalsIgnoreCase("they")) return "P";
        else if(word.equalsIgnoreCase("their")) return "P";
        else return "U";
      }
    }catch(NullPointerException e){
      return basicNumber(m);
    }
  }
  
  // FIXME, looks at the POS of the first NN type in the Markable, should look at the head
  // (Being fixed above in newNumber)
  String basicNumber (Markable m) {
    ArrayList<BaseToken> l = containedTokens(m.getContent().getBegin(), m.getContent().getEnd());
    for (BaseToken t : l) {
      String pos = t.getPartOfSpeech();
      if (pos.equals("NN") || pos.equals("NNP"))
        return "S";
      else if (pos.equals("NNS") || pos.equals("NNPS"))
        return "P";
//      else if (pos.equals("PRP")) {
//        if (m.getCoveredText().equalsIgnoreCase("we") || m.getCoveredText().equalsIgnoreCase("they"))
//          return "P";
//        else
//          return "S";
//      }
    }
    return "U";
  }


  public ArrayList<BaseToken> containedTokens (int a, int b) {
    ArrayList<BaseToken> ret = new ArrayList<BaseToken>();
    BaseToken t1 = hbs.get(a);
    BaseToken t2 = hbe.get(b);
    if (t1!=null && t2!=null) {
      int begin = t1.getTokenNumber();
      int end = t2.getTokenNumber();
      LinkedList<Annotation> l = FSIteratorToList.convert(jcas.getJFSIndexRepository().getAnnotationIndex(BaseToken.type).iterator());
      for (int i = 0; i < l.size(); i++) {
        BaseToken t = (BaseToken) l.get(i);
        if (t.getTokenNumber()>=begin && t.getTokenNumber()<=end)
          ret.add(t);
      }
    }
//    int e;
//    while (t!=null && (e=t.getEnd())<=b) {
//      ret.add(t);
//      t = hbs.get(e);
//    }
    return ret;
  }
  
  /*
  public TreebankNode markableNode(int a, int b){
    TreebankNode lowestDom = null;
    int overage = Integer.MAX_VALUE;
    FSIterator<Annotation> iter = jcas.getJFSIndexRepository().getAnnotationIndex(TreebankNode.type).iterator();
    while(iter.hasNext()){
      TreebankNode node = (TreebankNode) iter.next();
      if(node.getBegin() == a && node.getEnd() == b){
        // this code will drill down -- actually want to go other way
//        while(node.getChildren() != null && node.getChildren().size() == 1){
//          node = node.getChildren(0);
//        }
        
        // this code will head up as long as parent has the same span
        try{
          while(node.getParent() != null && node.getParent().getChildren().size() == 1){
            node = node.getParent();
          }
        }catch(NullPointerException e){
          System.err.println("Null pointer exception in AttributeCalculator::markableNode()");
        }
        return node;
      }else if(node.getBegin() <= a && node.getEnd() >= b){
        int tempOver = (a-node.getBegin()) + (node.getEnd()-b);
        if(tempOver < overage){
          lowestDom = node;
          overage = tempOver;
        }
      }
    }
    // There are lots of reasons to get this far -- error in the parse, personal pronoun in an NP (not annotated so not a markable),
    // unrecognized NML structure, etc.
    // Some other work will add any such nodes to the parse tree as in haghighi klein 09 (simple synt sem ...)
    // In contrast, we don't add node to the parse tree, just find the lowest node dominating the markable range
    // TODO test this
    return lowestDom;
//    return null;
  }
*/
  
  public ArrayList<String> contentWords (int begin, int end) {
    ArrayList<String> ret = new ArrayList<String>();
    ArrayList<BaseToken> l = containedTokens(begin, end);
    for (BaseToken t : l) {
      String s = t.getCoveredText().toLowerCase();
      if (!stopwords.contains(s))
        ret.add(s);
    }
    return ret;
  }


  ArrayList<String> contentWords (Markable m) {
    return contentWords(m.getBegin(), m.getEnd());
  }


}
Source Code of org.apache.ctakes.coreference.util.AttributeCalculator

Related Classes of org.apache.ctakes.coreference.util.AttributeCalculator