// Source Code of org.apache.ctakes.coreference.ae.MarkableProb

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.coreference.ae;


import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;


import libsvm.svm;
import libsvm.svm_model;
import libsvm.svm_node;


import org.apache.ctakes.core.resource.FileResource;
import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
import org.apache.ctakes.coreference.type.BooleanLabeledFS;
import org.apache.ctakes.coreference.type.DemMarkable;
import org.apache.ctakes.coreference.type.Markable;
import org.apache.ctakes.coreference.type.MarkablePairSet;
import org.apache.ctakes.coreference.type.NEMarkable;
import org.apache.ctakes.coreference.type.PronounMarkable;
import org.apache.ctakes.coreference.util.AbstractClassifier;
import org.apache.ctakes.coreference.util.CorefConsts;
import org.apache.ctakes.coreference.util.FSIteratorToList;
import org.apache.ctakes.coreference.util.FeatureVector;
import org.apache.ctakes.coreference.util.MarkableTreeUtils;
import org.apache.ctakes.coreference.util.SvmVectorCreator;
import org.apache.ctakes.coreference.util.SyntaxAttributeCalculator;
import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
import org.apache.ctakes.typesystem.type.relation.RelationArgument;
import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.EmptyFSList;
import org.apache.uima.jcas.cas.FSList;
import org.apache.uima.jcas.cas.NonEmptyFSList;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;


public class MipacqSvmChainCreator extends JCasAnnotator_ImplBase {


  // LOG4J logger based on class name
  private Logger logger = Logger.getLogger(getClass().getName());


  // Debug flag; within this class it is only referenced from commented-out code.
  private boolean debug = false;


  // svm models
//  private AbstractClassifier mod_pron, mod_dem, mod_coref;
  // Classifier built in initialize() from the "svmCorefModel" resource; its predict()
  // result is used as the score of an (anaphor, antecedent) feature vector.
  private AbstractClassifier mod_coref;
//  private AbstractClassifier mod_pron;
//  private svm_model mod_anaphoricity;
  // positive class label index in libsvm's prob_est array
//  private int anaphoricity_idx
//  private int coref_idx;
  // Builds the svm_node[] feature vectors passed to mod_coref; created in initialize()
  // once the stop word list has been read, and stays null if that loading fails.
  private SvmVectorCreator vecCreator = null;
  
//  ParentPtrTree ppt;


  // Stop words read from the "stopWords" resource in initialize().
  HashSet<String> stopwords;
  // Second space-separated token of every line of the "frags" resource, in file order.
  private ArrayList<String> treeFrags;


/*  private svm_model loadModel (UimaContext uc, String m) {
    svm_model ret = null;
    try {
      String r = ((FileResource) uc.getResourceObject(m)).getFile().getAbsolutePath();
      ret = svm.svm_load_model(r);
      logger.info(m+" loaded: "+r);
    } catch (Exception e) {
      e.printStackTrace();
      logger.error("Error loading "+m);
    }
    return ret;
  }
*/
  @Override
  public void initialize(UimaContext uc) throws ResourceInitializationException {
    super.initialize(uc);


    // Load svm models
//    mod_anaphoricity = loadModel(uc, "svmAnaphoricityModel");
    // FIXME why is there a minus one here?
//    mod_pron = new AbstractClassifier(uc, "svmPronModel", FeatureVector.getPronCorefFeatures().length + SyntaxAttributeCalculator.getNumPronFeats() - 1);
//    mod_dem = new AbstractClassifier(uc, "svmDemModel", FeatureVector.getDemCorefFeatures().length + SyntaxAttributeCalculator.getNumDemFeats() - 1);
    mod_coref = new AbstractClassifier(uc, "svmCorefModel", FeatureVector.getNECorefFeatures().length + SyntaxAttributeCalculator.getNumNEFeats() - 1);


//    int[] labels = new int[2];
//    svm.svm_get_labels(mod_anaphoricity, labels);
//    anaphoricity_idx = labels[0]==1 ? 0 : 1;
    //    svm.svm_get_labels(mod_coref, labels);
    //    coref_idx = labels[0]==1 ? 0 : 1;


    // Load stop words list
    try {
      stopwords = new HashSet<String>();
      FileResource r = (FileResource) uc.getResourceObject("stopWords");
      Scanner scanner = new Scanner(r.getFile());
      String l;
      while (scanner.hasNextLine()) {
        l = scanner.nextLine().trim();
        if (l.length()==0) continue;
        int i = l.indexOf('|');
        if (i > 0)
          stopwords.add(l.substring(0,i).trim());
        else if (i < 0)
          stopwords.add(l.trim());
      }
      logger.info("Stop words list loaded: " + r.getFile().getAbsolutePath());
      vecCreator = new SvmVectorCreator(stopwords);


      treeFrags = new ArrayList<String>();
      r = (FileResource) uc.getResourceObject("frags");
      if(r != null){
        scanner = new Scanner(r.getFile());
        while(scanner.hasNextLine()){
          String line = scanner.nextLine();
          treeFrags.add(line.split(" ")[1]);
        }
        vecCreator.setFrags(treeFrags);
      }
      logger.info("Tree fragment features loaded: " + r.getFile().getAbsolutePath());
    } catch (Exception e) {
      e.printStackTrace();
      logger.error("Error loading stop words list");
    }
  }


  @Override
  public void process(JCas jcas) throws AnalysisEngineProcessException {
    // Convert the orderless FSIterator to List, sort by char offsets
    LinkedList<Annotation> lm = FSIteratorToList.convert(
        jcas.getJFSIndexRepository().getAnnotationIndex(Markable.type).iterator());
    Map<Markable, NonEmptyFSList> collectionRas = new HashMap<Markable, NonEmptyFSList>();
    String docName = DocumentIDAnnotationUtil.getDocumentID(jcas);
    logger.info("Classifying coreference in document: " + docName);
//    ArrayList<CollectionTextRelation> chains = new ArrayList<CollectionTextRelation>();
    int chainId = 0;
    
    EmptyFSList emptyList = new EmptyFSList(jcas);
    // Create a parent pointer tree to calculate equivalence classes
//    ppt = new ParentPtrTree(lm.size());


    // Make a data structure mapping markables to indexes so we don't lose the order if we re-arrange
    Map<Markable, Integer> m2q = new HashMap<Markable,Integer>();
    
    for(int p = 0; p < lm.size(); p++){
      m2q.put((Markable)lm.get(p), p);
    }
      
    FSIterator<Annotation> iter = jcas.getAnnotationIndex(MarkablePairSet.type).iterator();
    while(iter.hasNext()){
      MarkablePairSet set = (MarkablePairSet) iter.next();
      Markable anaphor = set.getAnaphor();
      FSList fs = (FSList) set.getAntecedentList();
      MarkableProb bestAnte = null;
      LinkedList<Markable> ll = fs2ll(fs);
      if(anaphor instanceof PronounMarkable){
        // There is not enough training data to do this reliably... the
        // classifier for this type will decrease scores
//        bestAnte = processPronoun(anaphor, ll, jcas);
        bestAnte = new MarkableProb(null, 0.0);
      }else if(anaphor instanceof NEMarkable){
        bestAnte = processNE(anaphor, ll, jcas);
      }else if(anaphor instanceof DemMarkable){
        bestAnte = processDem(anaphor, ll, jcas);
      }else{
        // should not happenn...
        continue;
      }


      if(bestAnte.prob > CorefConsts.COREF_THRESHOLD){
        // create the coref relation type
        CoreferenceRelation cr = new CoreferenceRelation(jcas);
        cr.setCategory("Coreference");
        RelationArgument ra1 = new RelationArgument(jcas);
        ra1.setId(bestAnte.m.getId());
        ra1.setArgument(bestAnte.m.getContent());
        ra1.setRole("antecedent");
        RelationArgument ra2 = new RelationArgument(jcas);
        ra2.setId(anaphor.getId());
        ra2.setArgument(anaphor.getContent());
        ra2.setRole("anaphor");
        cr.setArg1(ra1);
        cr.setArg2(ra2);
        cr.setConfidence(bestAnte.prob);
        ra1.addToIndexes();
        ra2.addToIndexes();
        cr.addToIndexes();
        
        // propagate the collection relation type
        RelationArgument anaRa = new RelationArgument(jcas);
        anaRa.setId(anaphor.getId());
        anaRa.setArgument(anaphor.getContent());
        anaRa.setRole("mention");
        NonEmptyFSList node = new NonEmptyFSList(jcas);
        node.setHead(anaRa);
        node.setTail(emptyList);
        collectionRas.put(anaphor, node);
        NonEmptyFSList anteNode = null;
        if(collectionRas.containsKey(bestAnte.m)){
          anteNode = collectionRas.get(bestAnte.m);
          // find the end of the chain of this node
          while(anteNode.getTail() instanceof NonEmptyFSList){
            anteNode = (NonEmptyFSList) anteNode.getTail();
          }
        }else{
          RelationArgument anteRa = new RelationArgument(jcas);
          anteRa.setId(bestAnte.m.getId());
          anteRa.setArgument(bestAnte.m.getContent());
          anteRa.setRole("mention");
          
          anteNode = new NonEmptyFSList(jcas);
          anteNode.setHead(anteRa);
          collectionRas.put(bestAnte.m, anteNode);
          CollectionTextRelation chain = new CollectionTextRelation(jcas);
          chain.setId(chainId++);
          chain.setCategory("CoreferenceChain");
          chain.setMembers(anteNode);
          chain.addToIndexes();
        }
        anteNode.setTail(node);
        
        
//        ppt.union(m2q.get(anaphor), m2q.get(bestAnte.m));
        if(anaphor instanceof PronounMarkable){
          // if the anaphor is a pronoun then it won't be in the cas as an identifiedannotation so we need to add it.
          IdentifiedAnnotation ia = new IdentifiedAnnotation(jcas);
          // TODO
        }
      }else{
//        indexNegativeExample(jcas, bestAnte.m, anaphor, bestAnte.prob);
      }
    }
    logger.info("Done classifying document: " + docName);


//    // Extract equivalence classes and save them into CAS
//    int[] ec = new int[ppt.getSize()]; // class number for each Markable
//    int n = ppt.equivCls(ec); // n holds the number of classes
//    EmptyFSList elist = new EmptyFSList(jcas); // shared tail for all chains
//    FSList[] listhds = new FSList[n]; // keep track of the heads of all chains
//    CollectionTextRelation[] chains = new CollectionTextRelation[n];


//    // Initialize n chains
//    for (int i = 0; i < n; ++i) {
//      chains[i] = null; //new CollectionTextRelation(jcas);
////      chains[i].setId(i);
////      chains[i].setCategory("CoreferenceChain");
////      chains[i].addToIndexes();
//      listhds[i] = elist;
//    }


//    // Scan from the end of the Markable list
//    // insert Markables to the head of their chains
//    for (int i = ec.length-1; i >= 0; --i) {
//      if(m2ra.containsKey(lm.get(i))){
//        NonEmptyFSList l = new NonEmptyFSList(jcas);
//        l.setHead(m2ra.get(lm.get(i)));
//        l.setTail(listhds[ec[i]]);
//        listhds[ec[i]] = l;
//        if(chains[ec[i]] == null){
//          chains[ec[i]] = new CollectionTextRelation(jcas);
//        }
//        chains[ec[i]].setMembers(l);
//      }
//    }
    
//    int j = 0;
//    for(int i = 0; i < n; i++){
//      if(chains[i] != null){
//        chains[i].setId(j++);
//        chains[i].setCategory("CoreferenceChain");
//        chains[i].addToIndexes();
//      }
//    }
  }




  private LinkedList<Markable> fs2ll(FSList fs) {
    LinkedList<Markable> ll = new LinkedList<Markable>();
    while(fs instanceof NonEmptyFSList){
      NonEmptyFSList node = (NonEmptyFSList) fs;
      BooleanLabeledFS feat = (BooleanLabeledFS) node.getHead();
      Markable antecedent = (Markable) feat.getFeature();
      ll.add(antecedent);
      fs = node.getTail();
    }
    return ll;
  }
  
  private MarkableProb processPronoun(Markable anaphor, LinkedList<Markable> anteList, JCas jcas){
    Markable ante = null;
    double bestProb = 0.0;
    List<Markable> resortedList = anteList;
    for(Markable antecedent : resortedList){
      svm_node[] nodes = vecCreator.getNodeFeatures(anaphor, antecedent, jcas);
      
      double prob = 0.0;
      prob = mod_coref.predict(nodes);
      if(prob > bestProb){
//        if(debug) indexNegativeExample(jcas, ante, anaphor, bestProb); // save former best as non-ante...
        bestProb = prob;
        ante = antecedent;
      }else{
//        if(debug){
//          indexNegativeExample(jcas, antecedent, anaphor, prob);
//        }
      }
      if(bestProb > 0.5) break;
    }
    return new MarkableProb(ante, bestProb);
  }


  private MarkableProb processNE(Markable anaphor, List<Markable> anteList, JCas jcas){
    Markable ante = null;
    double bestProb = 0.0;
    for(Markable antecedent : anteList){
      svm_node[] nodes = vecCreator.getNodeFeatures(anaphor, antecedent, jcas, true);
      double prob = 0.0;
      prob = mod_coref.predict(nodes);
      if(prob > bestProb){
//        if(debug) indexNegativeExample(jcas, ante, anaphor, bestProb);
        bestProb = prob;
        ante = antecedent;
      }else{
//        if(debug) indexNegativeExample(jcas, antecedent, anaphor, prob);
      }
    }
    return new MarkableProb(ante, bestProb);
  }
  
  private MarkableProb processNELazily(Markable anaphor, List<Markable> anteList, JCas jcas){
    if(anteList.size() > 0) return new MarkableProb(anteList.get(0), 1.0);
    else return new MarkableProb(null,0.0);
  }
  
  private MarkableProb processDem(Markable anaphor, List<Markable> anteList, JCas jcas){
    double bestProb = 0.0;
    TreebankNode n = MarkableTreeUtils.markableNode(jcas, anaphor.getBegin(), anaphor.getEnd());
    TreebankNode parent = (n != null ? n.getParent() : null);
    TreebankNode gparent = (parent != null ? parent.getParent() : null);
    Markable ante = null;
    for(Markable antecedent: anteList){
      if(n!=null && parent != null && gparent != null && n.getNodeType().equals("WHNP") && parent.getNodeType().equals("SBAR")
          && gparent.getNodeType().equals("NP") && gparent.getChildren(1) == parent && gparent.getChildren(0).getNodeType().equals("NP")){
        TreebankNode anteNode = gparent.getChildren(0);
        Markable trueAnte = MarkableTreeUtils.nodeMarkable(jcas, anteNode.getBegin(), anteNode.getEnd());
        if(trueAnte == antecedent){
          bestProb = 1.0;
          ante = antecedent;
          break;
        }
      }
    }
    return new MarkableProb(ante, bestProb);
  }
  




//  private void indexNegativeExample(JCas jcas, Markable ante, Markable ana,
//      double d) {
//    if(ante == null) return;
//    // README - If needed for debugging needs to be reimplemented now that type system has changed...
//    UncoreferentRelation rel = new UncoreferentRelation(jcas);
//    RelationArgument arg1 = new RelationArgument(jcas);
//    arg1.setArgument(ante);
//    arg1.setRole("nonantecedent");
//    RelationArgument arg2 = new RelationArgument(jcas);
//    arg2.setArgument(ana);
//    arg2.setRole("nonanaphor");
//    rel.setArg1(arg1);
//    rel.setArg2(arg2);
//    rel.setConfidence(d);
//    rel.addToIndexes();
//  }
}


class MarkableProb{
  public MarkableProb(Markable m, Double p){
    this.m = m;
    this.prob = p;
  }
  Markable m;
  Double prob;
}
// Source Code of org.apache.ctakes.coreference.ae.MarkableProb

// Related Classes of org.apache.ctakes.coreference.ae.MarkableProb