Source Code of org.apache.ctakes.assertion.medfacts.cleartk.AssertionCleartkAnalysisEngine

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.assertion.medfacts.cleartk;


import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Random;


import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
import org.apache.ctakes.assertion.zoner.types.Zone;
import org.apache.ctakes.typesystem.type.constants.CONST;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.temporary.assertion.AssertionCuePhraseAnnotation;
import org.apache.ctakes.typesystem.type.textsem.EntityMention;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.classifier.CleartkAnnotator;
import org.cleartk.classifier.Feature;
import org.cleartk.classifier.Instance;
import org.cleartk.classifier.feature.extractor.CleartkExtractor;
import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
import org.cleartk.classifier.feature.extractor.simple.CoveredTextExtractor;
import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
import org.cleartk.classifier.feature.extractor.simple.TypePathExtractor;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.ConfigurationParameterFactory;
import org.uimafit.util.JCasUtil;
//import org.chboston.cnlp.ctakes.relationextractor.ae.ModifierExtractorAnnotator;


public abstract class AssertionCleartkAnalysisEngine extends
    CleartkAnnotator<String>
{
  Logger logger = Logger.getLogger(AssertionCleartkAnalysisEngine.class);


  public static final String PARAM_GOLD_VIEW_NAME = "GoldViewName";
  
  public static int relationId; // counter for error logging


  @ConfigurationParameter(
      name = PARAM_GOLD_VIEW_NAME,
      mandatory = false,
      description = "view containing the manual identified annotations (especially EntityMention and EventMention annotations); needed for training")
  protected String goldViewName;


  public static final String PARAM_PRINT_ERRORS = "PrintErrors";
  
  @ConfigurationParameter(
     name = PARAM_PRINT_ERRORS,
     mandatory = false,
     description = "Print errors true/false",
     defaultValue = "false")
  boolean printErrors;
  
  public static final String PARAM_PROBABILITY_OF_KEEPING_DEFAULT_EXAMPLE = "ProbabilityOfKeepingADefaultExample";


  @ConfigurationParameter(
      name = PARAM_PROBABILITY_OF_KEEPING_DEFAULT_EXAMPLE,
      mandatory = false,
      description = "probability that a default example should be retained for training")
  protected double probabilityOfKeepingADefaultExample = 1.0;


  public static final String PARAM_FEATURE_SELECTION_THRESHOLD = "WhetherToDoFeatureSelection"; // Accurate name? Actually uses the threshold, right?


  @ConfigurationParameter(
      name = PARAM_FEATURE_SELECTION_THRESHOLD,
      mandatory = false,
      description = "the Chi-squared threshold at which features should be removed")
  protected Float featureSelectionThreshold = 0f;


  public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";


  @ConfigurationParameter(
      mandatory = false,
      name = PARAM_FEATURE_SELECTION_URI,
      description = "provides a URI where the feature selection data will be written")
  protected URI featureSelectionURI;
  
  protected Random coin = new Random(0);


  protected static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";


  protected String lastLabel;
  
  
/* DEPRECATED: STW 2013/03/28.  Use DependencyUtility:getNominalHeadNode(jCas,annotation) instead */
//  public ConllDependencyNode findAnnotationHead(JCas jcas, Annotation annotation) {
//    
//      for (ConllDependencyNode depNode : JCasUtil.selectCovered(jcas, ConllDependencyNode.class, annotation)) {
//        
//        ConllDependencyNode head = depNode.getHead();
//        if (head == null || head.getEnd() <= annotation.getBegin() || head.getBegin() > annotation.getEnd()) {
//          // The head is outside the bounds of the annotation, so this node must be the annotation's head
//          return depNode;
//        }
//      }
//      // Can this happen?
//      return null;
//  }


  
  
  
//private SimpleFeatureExtractor tokenFeatureExtractor;
//  protected List<ContextExtractor<IdentifiedAnnotation>> contextFeatureExtractors;
//  protected List<ContextExtractor<BaseToken>> tokenContextFeatureExtractors;
  protected List<CleartkExtractor> contextFeatureExtractors;
  protected List<CleartkExtractor> tokenContextFeatureExtractors;
  protected List<CleartkExtractor> tokenCleartkExtractors;
  protected List<SimpleFeatureExtractor> entityFeatureExtractors;
  protected CleartkExtractor cuePhraseInWindowExtractor;


  protected FeatureSelection<String> featureSelection;
  
  public abstract void setClassLabel(IdentifiedAnnotation entityMention, Instance<String> instance) throws AnalysisEngineProcessException;


  protected abstract void initializeFeatureSelection() throws ResourceInitializationException;
//  public abstract FeatureSelection<String> createFeatureSelection(double threshold);
//  public abstract URI createFeatureSelectionURI(File outputDirectoryName);


  @Override
  @SuppressWarnings("deprecation")
  public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);
    
    if (this.isTraining() && this.goldViewName == null) {
      throw new IllegalArgumentException(PARAM_GOLD_VIEW_NAME + " must be defined during training");
    }
    
    // alias for NGram feature parameters
//    int fromRight = CharacterNGramProliferator.RIGHT_TO_LEFT;


    // a list of feature extractors that require only the token:
    // the stem of the word, the text of the word itself, plus
    // features created from the word text like character ngrams
    this.entityFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
    
    // a list of feature extractors that require the token and the sentence
//    this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
    
    this.tokenCleartkExtractors = new ArrayList<CleartkExtractor>();


    CleartkExtractor tokenExtraction1 = 
        new CleartkExtractor(
            BaseToken.class, 
//            new FeatureFunctionExtractor(new CoveredTextExtractor(), new LowerCaseFeatureFunction()),
            new CoveredTextExtractor(),
            //new CleartkExtractor.Covered(),
            new CleartkExtractor.LastCovered(2),
            new CleartkExtractor.Preceding(5),
            new CleartkExtractor.Following(4),
            new CleartkExtractor.Bag(new CleartkExtractor.Preceding(3)),
            new CleartkExtractor.Bag(new CleartkExtractor.Following(3)),
            new CleartkExtractor.Bag(new CleartkExtractor.Preceding(5)),
            new CleartkExtractor.Bag(new CleartkExtractor.Following(5)),
            new CleartkExtractor.Bag(new CleartkExtractor.Preceding(10)),
            new CleartkExtractor.Bag(new CleartkExtractor.Following(10))
            );
    
//    CleartkExtractor posExtraction1 = 
//        new CleartkExtractor(
//            BaseToken.class,
//            new TypePathExtractor(BaseToken.class, "partOfSpeech"),
//            new CleartkExtractor.LastCovered(2),
//            new CleartkExtractor.Preceding(3),
//            new CleartkExtractor.Following(2)
//            );


    this.tokenCleartkExtractors.add(tokenExtraction1);
    //this.tokenCleartkExtractors.add(posExtraction1);
    
//    this.contextFeatureExtractors.add(new CleartkExtractor(IdentifiedAnnotation.class,
//        new CoveredTextExtractor(),
//        //new TypePathExtractor(IdentifiedAnnotation.class, "stem"),
//        new Preceding(2),
//        new Following(2)));
    
    // stab at dependency-based features
    //List<Feature> features = new ArrayList<Feature>();
    //ConllDependencyNode node1 = findAnnotationHead(jCas, arg1);


    CombinedExtractor baseExtractorCuePhraseCategory =
        new CombinedExtractor
          (
           new CoveredTextExtractor(),
           new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhrase"),
           new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhraseCategory"),
           new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhraseAssertionFamily")
          );
    
    cuePhraseInWindowExtractor = new CleartkExtractor(
        BaseToken.class,
        new CoveredTextExtractor(),
        new CleartkExtractor.Bag(new CleartkExtractor.Covered())
//          AssertionCuePhraseAnnotation.class,
//          baseExtractorCuePhraseCategory,
//          new CleartkExtractor.Bag(new CleartkExtractor.Preceding(3)),
//          new CleartkExtractor.Bag(new CleartkExtractor.Following(3)),
//          new CleartkExtractor.Bag(new CleartkExtractor.Preceding(5)),
//          new CleartkExtractor.Bag(new CleartkExtractor.Following(5)),
//          new CleartkExtractor.Bag(new CleartkExtractor.Preceding(10)),
//          new CleartkExtractor.Bag(new CleartkExtractor.Following(10))
          );
    
  }


  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException
  {
    DocumentID documentId = JCasUtil.selectSingle(jCas, DocumentID.class);
    if (documentId != null)
    {
      logger.debug("processing next doc: " + documentId.getDocumentID());
    } else
    {
      logger.warn("processing next doc (doc id is null)");
    }
    
    this.lastLabel = "<BEGIN>";
    
//    // get gold standard relation instances during testing for error analysis
//    if (! this.isTraining() && printErrors) {
//      JCas goldView;
//      try {
//        goldView = jCas.getView("GoldView");
//      } catch(CASException e) {
//        throw new AnalysisEngineProcessException(e);
//      }
//      
//      //categoryLookup = createCategoryLookup(goldView); 
//    }
    
    JCas identifiedAnnotationView;
    if (this.isTraining()) {
      try {
        identifiedAnnotationView = jCas.getView(this.goldViewName);
      } catch (CASException e) {
        throw new AnalysisEngineProcessException(e);
      }
    } else {
      identifiedAnnotationView = jCas;
    }




//    Map<IdentifiedAnnotation, Collection<Sentence>> coveringSentenceMap = JCasUtil.indexCovering(identifiedAnnotationView, IdentifiedAnnotation.class, Sentence.class);
//    Map<Sentence, Collection<BaseToken>> tokensCoveredInSentenceMap = JCasUtil.indexCovered(identifiedAnnotationView, Sentence.class, BaseToken.class);


    Map<IdentifiedAnnotation, Collection<Zone>> coveringZoneMap =
        JCasUtil.indexCovering(jCas, IdentifiedAnnotation.class, Zone.class);
//    Map<IdentifiedAnnotation, Collection<Sentence>> coveringSents =
//        JCasUtil.indexCovering(jCas, IdentifiedAnnotation.class, Sentence.class);
    
//    List<Instance<String>> instances = new ArrayList<Instance<String>>();
    // generate a list of training instances for each sentence in the document
    Collection<IdentifiedAnnotation> entities = JCasUtil.select(identifiedAnnotationView, IdentifiedAnnotation.class);
    for (IdentifiedAnnotation identifiedAnnotation : entities)
    {
      if (!(identifiedAnnotation instanceof EntityMention || identifiedAnnotation instanceof EventMention))
      {
        continue;
      }
      IdentifiedAnnotation entityOrEventMention = identifiedAnnotation;
      if (entityOrEventMention.getPolarity() == -1)
      {
        logger.debug(String.format(" - identified annotation: [%d-%d] polarity %d (%s)",
            entityOrEventMention.getBegin(),
            entityOrEventMention.getEnd(),
            entityOrEventMention.getPolarity(),
            entityOrEventMention.getClass().getName()));
      }
      Instance<String> instance = new Instance<String>();
      
//      // extract all features that require only the entity mention annotation
//      instance.addAll(tokenFeatureExtractor.extract(jCas, entityMention));


      // extract all features that require the token and sentence annotations


      //Sentence sentence = sentenceList.iterator().next();
      
      /*
      if (sentence != null)
      {
        for (ContextExtractor<IdentifiedAnnotation> extractor : this.contextFeatureExtractors) {
          instance.addAll(extractor.extractWithin(identifiedAnnotationView, entityMention, sentence));
        }
      } else
      {
        // TODO extract context features for annotations that don't fall within a sentence
        logger.log(Level.WARN, "FIXME/TODO: generate context features for entities that don't fall within a sentence");
      }
      */
      
      /*
      for (ContextExtractor<BaseToken> extractor : this.tokenContextFeatureExtractors) {
          instance.addAll(extractor.extract(identifiedAnnotationView, entityMention));
        }
        */
      for (CleartkExtractor extractor : this.tokenCleartkExtractors) {
          //instance.addAll(extractor.extractWithin(identifiedAnnotationView, entityMention, sentence));
        instance.addAll(extractor.extract(identifiedAnnotationView, entityOrEventMention));
        }
      
//      List<Feature> cuePhraseFeatures = null;
//          cuePhraseInWindowExtractor.extract(jCas, entityOrEventMention);
          //cuePhraseInWindowExtractor.extractWithin(jCas, entityMention, firstCoveringSentence);
//      List<Sentence> sents = new ArrayList<Sentence>(coveringSents.get(entityOrEventMention));
      List<Sentence> sents = new ArrayList<Sentence>(JCasUtil.selectCovering(jCas, Sentence.class, entityOrEventMention.getBegin(), entityOrEventMention.getEnd()));
      if(sents.size() > 0){
        Sentence sentence = sents.get(0);
        List<AssertionCuePhraseAnnotation> cues = JCasUtil.selectCovered(AssertionCuePhraseAnnotation.class, sentence);
        int closest = Integer.MAX_VALUE;
        AssertionCuePhraseAnnotation closestCue = null;
        for(AssertionCuePhraseAnnotation cue : cues){
          List<BaseToken> tokens = JCasUtil.selectBetween(BaseToken.class, cue, entityOrEventMention);
          if(tokens.size() < closest){
            closestCue = cue;
            closest = tokens.size();
          }
//          instance.addAll(cuePhraseInWindowExtractor.extractBetween(jCas, cue, entityOrEventMention));
        }
        if(closestCue != null && closest < 21){
          instance.add(new Feature("ClosestCue_Word", closestCue.getCoveredText()));
//          instance.add(new Feature("ClosestCue_Phrase", closestCue.getCuePhrase()));
          instance.add(new Feature("ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily()));
          instance.add(new Feature("ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory()));
        }
      }
//      if (cuePhraseFeatures != null && !cuePhraseFeatures.isEmpty())
//      {
//        instance.addAll(cuePhraseFeatures);
//      }




      // 7/9/13 SRH trying to make it work just for anatomical site
      int eemTypeId = entityOrEventMention.getTypeID(); 
      if (eemTypeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE) {
          // 7/9/13 srh modified per tmiller so it's binary but not numeric feature
          //instance.add(new Feature("ENTITY_TYPE_" + entityOrEventMention.getTypeID()));
          instance.add(new Feature("ENTITY_TYPE_ANAT_SITE"));
      } /* This hurts recall more than it helps precision
      else if (eemTypeId == CONST.NE_TYPE_ID_DRUG) {
        // 7/10 adding drug
        instance.add(new Feature("ENTITY_TYPE_DRUG"));
      }
      */
      
      for (SimpleFeatureExtractor extractor : this.entityFeatureExtractors) {
        instance.addAll(extractor.extract(jCas, entityOrEventMention));
      }
      
      
      List<Feature> zoneFeatures = extractZoneFeatures(coveringZoneMap, entityOrEventMention);
      if (zoneFeatures != null && !zoneFeatures.isEmpty())
      {
//        instance.addAll(zoneFeatures);
      }
      
      List<Feature> feats = instance.getFeatures();
//      List<Feature> lcFeats = new ArrayList<Feature>();
      
      for(Feature feat : feats){
        if(feat.getName() != null && (feat.getName().startsWith("TreeFrag") || feat.getName().startsWith("WORD") || feat.getName().startsWith("NEG"))) continue;
        if(feat.getValue() instanceof String){
          feat.setValue(((String)feat.getValue()).toLowerCase());
        }
      }


      // grab the output label
      setClassLabel(entityOrEventMention, instance);


      if (this.isTraining()) {
        // apply feature selection, if necessary
        if (this.featureSelection != null) {
          feats = this.featureSelection.transform(feats);
        }


        // ensures that the (possibly) transformed feats are used
        if (instance.getOutcome()!=null) {
          this.dataWriter.write(new Instance<String>(instance.getOutcome(),feats));
        }
      }
    }
    
  }
  
  public List<Feature> extractZoneFeatures(Map<IdentifiedAnnotation, Collection<Zone>> coveringZoneMap, IdentifiedAnnotation entityOrEventMention)
  {
    final Collection<Zone> zoneList = coveringZoneMap.get(entityOrEventMention);
    
    if (zoneList == null || zoneList.isEmpty())
    {
      //logger.info("AssertionCleartkAnalysisEngine.extractZoneFeatures() early END (no zones)");
      return new ArrayList<Feature>();
    } else
    {
      logger.debug("AssertionCleartkAnalysisEngine.extractZoneFeatures() found zones and adding zone features");
    }
    
    ArrayList<Feature> featureList = new ArrayList<Feature>();
    for (Zone zone : zoneList)
    {
      Feature currentFeature = new Feature("zone", zone.getLabel());
      logger.debug(String.format("zone: %s", zone.getLabel()));
      logger.debug(String.format("zone feature: %s", currentFeature.toString()));
      featureList.add(currentFeature);
    }
    
    return featureList;
  }


  public static AnalysisEngineDescription getDescription(Object... additionalConfiguration)
        throws ResourceInitializationException {
      AnalysisEngineDescription desc = AnalysisEngineFactory.createPrimitiveDescription(AssertionCleartkAnalysisEngine.class);
      if (additionalConfiguration.length > 0) {
        ConfigurationParameterFactory.addConfigurationParameters(desc, additionalConfiguration);
      }
      return desc;
    }




  
  /*
  public static AnalysisEngineDescription getClassifierDescription(String modelFileName)
      throws ResourceInitializationException {
    return CleartkAnnotatorDescriptionFactory.createCleartkAnnotator(
        AssertionCleartkAnalysisEngine.class,
        AssertionComponents.TYPE_SYSTEM_DESCRIPTION,
        modelFileName);
  }


  public static AnalysisEngineDescription getWriterDescription(String outputDirectory)
      throws ResourceInitializationException {
    AnalysisEngineDescription aed = CleartkAnnotatorDescriptionFactory.createViterbiAnnotator(
        AssertionCleartkAnalysisEngine.class,
        AssertionComponents.TYPE_SYSTEM_DESCRIPTION,
        DefaultMaxentDataWriterFactory.class,
        outputDirectory);
    ConfigurationParameterFactory.addConfigurationParameter(
        aed,
        MaxentDataWriterFactory_ImplBase.PARAM_COMPRESS,
        true);
    return aed;
  }
  */
}
Source Code of org.apache.ctakes.assertion.medfacts.cleartk.AssertionCleartkAnalysisEngine

Related Classes of org.apache.ctakes.assertion.medfacts.cleartk.AssertionCleartkAnalysisEngine