// Source Code of org.apache.ctakes.temporal.eval.Evaluation_ImplBase

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.temporal.eval;


import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;


import org.apache.ctakes.chunker.ae.Chunker;
import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
import org.apache.ctakes.constituency.parser.ae.ConstituencyParser;
import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
import org.apache.ctakes.core.ae.OverlapAnnotator;
import org.apache.ctakes.core.ae.SentenceDetector;
import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
import org.apache.ctakes.core.resource.FileLocator;
import org.apache.ctakes.core.resource.FileResourceImpl;
import org.apache.ctakes.core.resource.JdbcConnectionResourceImpl;
import org.apache.ctakes.core.resource.LuceneIndexReaderResourceImpl;
import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE;
import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
import org.apache.ctakes.lvg.ae.LvgAnnotator;
import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
import org.apache.ctakes.postagger.POSTagger;
import org.apache.ctakes.temporal.ae.I2B2TemporalXMLReader;
import org.apache.ctakes.temporal.ae.THYMEAnaforaXMLReader;
import org.apache.ctakes.temporal.ae.THYMEKnowtatorXMLReader;
import org.apache.ctakes.temporal.ae.THYMETreebankReader;
import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.Chunk;
import org.apache.ctakes.typesystem.type.syntax.TerminalTreebankNode;
import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
import org.apache.ctakes.typesystem.type.textsem.EntityMention;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.ctakes.typesystem.type.textsem.TimeMention;
import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.CasCopier;
import org.apache.uima.util.XMLSerializer;
import org.cleartk.util.ViewURIUtil;
import org.cleartk.util.ae.UriToDocumentTextAnnotator;
import org.cleartk.util.cr.UriCollectionReader;
import org.uimafit.component.JCasAnnotator_ImplBase;
import org.uimafit.component.ViewCreatorAnnotator;
import org.uimafit.component.ViewTextCopierAnnotator;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.AggregateBuilder;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.ExternalResourceFactory;
import org.uimafit.factory.TypePrioritiesFactory;
import org.uimafit.factory.TypeSystemDescriptionFactory;
import org.uimafit.pipeline.SimplePipeline;
import org.uimafit.util.JCasUtil;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;


import com.google.common.collect.Lists;
import com.lexicalscope.jewel.cli.Option;


public abstract class Evaluation_ImplBase<STATISTICS_TYPE> extends
    org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> {


  private static Logger LOGGER = Logger.getLogger(Evaluation_ImplBase.class);


  public static final String GOLD_VIEW_NAME = "GoldView";
  
  enum XMLFormat { Knowtator, Anafora, I2B2 }


  /**
   * Command-line options shared by the temporal evaluations. Each getter is
   * annotated with {@link Option}; getters declared with
   * {@code defaultToNull = true} may return {@code null}.
   */
  static interface Options {


    /** Option {@code --text}: directory of raw text files; may be null. */
    @Option(longName = "text", defaultToNull = true)
    public File getRawTextDirectory();


    /** Option {@code --xml}: directory holding the gold-standard XML annotations. */
    @Option(longName = "xml")
    public File getXMLDirectory();


    /** Option {@code --format}: format of the gold-standard XML; defaults to Anafora. */
    @Option(longName = "format", defaultValue="Anafora")
    public XMLFormat getXMLFormat();


    /** Option {@code --xmi}: directory where preprocessed CASes are stored as XMI files. */
    @Option(longName = "xmi")
    public File getXMIDirectory();


    /** Option {@code --patients}: the patient sets (integer ranges) to process. */
    @Option(longName = "patients")
    public CommandLine.IntegerRanges getPatients();
    
    /** Option {@code --treebank}: directory of gold treebank files; may be null. */
    @Option(longName = "treebank", defaultToNull=true)
    public File getTreebankDirectory();
    
    // NOTE(review): the flags below are not read anywhere in this class; their
    // exact meaning is defined by the concrete evaluations -- confirm there.
    @Option
    public boolean getUseGoldTrees();
    
    @Option
    public boolean getGrid();
    
    @Option
    public boolean getPrintErrors();
    
    @Option
    public boolean getPrintOverlappingSpans();
    
    @Option
    public boolean getTest();


    /** Option {@code --kernelParams}: raw kernel parameter string; may be null. */
    @Option(longName = "kernelParams", defaultToNull=true)
    public String getKernelParams();
    
    /** Output location for I2B2-format XML; may be null. */
    @Option(defaultToNull=true)
    public String getI2B2Output();
  }


  // Directory of raw text files; may be null (file selection then falls back
  // to the XML directory for the Anafora format).
  protected File rawTextDirectory;


  // Directory holding the gold-standard XML annotations.
  protected File xmlDirectory;
  
  // Format of the gold-standard XML; selects the reader and the file layout.
  protected XMLFormat xmlFormat;


  // Directory where preprocessed CASes are written to / read from as XMI.
  protected File xmiDirectory;


  // True when XMI files are believed to be available; decides whether the
  // preprocessor reads existing XMIs or runs the full pipeline and writes them.
  private boolean xmiExists;


  // Directory of gold treebank files; null means the constituency parser is run instead.
  protected File treebankDirectory;
  
  // NOTE(review): the next two flags are not read in this class; presumably
  // used by subclasses -- confirm.
  protected boolean printErrors = false;
  
  protected boolean printOverlapping = false;
  
  // Output location for I2B2 XML, set through setI2B2Output; null by default.
  protected String i2b2Output = null;
  
  // NOTE(review): not read in this class; presumably kernel settings for subclasses -- confirm.
  protected String[] kernelParams;
  
  public Evaluation_ImplBase(
      File baseDirectory,
      File rawTextDirectory,
      File xmlDirectory,
      XMLFormat xmlFormat,
      File xmiDirectory,
      File treebankDirectory) {
    super(baseDirectory);
    this.rawTextDirectory = rawTextDirectory;
    this.xmlDirectory = xmlDirectory;
    this.xmlFormat = xmlFormat;
    this.xmiDirectory = xmiDirectory;
    this.xmiExists = this.xmiDirectory.exists() && this.xmiDirectory.listFiles().length > 0;
    this.treebankDirectory = treebankDirectory;
  }


  public void setI2B2Output(String outDir){
    i2b2Output = outDir;
  }
  
  public void prepareXMIsFor(List<Integer> patientSets) throws Exception {
    boolean needsXMIs = false;
    for (File textFile : this.getFilesFor(patientSets)) {
      if (!getXMIFile(this.xmiDirectory, textFile).exists()) {
        needsXMIs = true;
        break;
      }
    }
    if (needsXMIs) {
      CollectionReader reader = this.getCollectionReader(patientSets);
      AnalysisEngine engine = this.getXMIWritingPreprocessorAggregateBuilder().createAggregate();
      SimplePipeline.runPipeline(reader, engine);
    }
    this.xmiExists = true;
  }
  
  private List<File> getFilesFor(List<Integer> patientSets) {
    List<File> files = new ArrayList<File>();
    if (this.rawTextDirectory == null
        && this.xmlFormat == XMLFormat.Anafora) {
      for (File dir : this.xmlDirectory.listFiles()) {
        Set<String> ids = new HashSet<String>();
        for (Integer set : patientSets) {
          ids.add(String.format("ID%03d", set));
        }
        if (dir.isDirectory()) {
          if (ids.contains(dir.getName().substring(0, 5))) {
            File file = new File(dir, dir.getName());
            if (file.exists()) {
              files.add(file);
            } else {
              LOGGER.warn("Missing note: " + file);
            }
          } else {
            LOGGER.info("Skipping note: " + dir);
          }
        }
      }
    } else if(this.xmlFormat == XMLFormat.I2B2) {
      File trainDir = new File(this.xmlDirectory, "training");
      File testDir = new File(this.xmlDirectory, "test");
      for (Integer pt : patientSets){
        File xmlTrain = new File(trainDir, pt+".xml");
        File train = new File(trainDir, pt+".xml.txt");
        if(train.exists()){
          if(xmlTrain.exists()){
            files.add(train);
          }else{
            System.err.println("Text file in training has no corresponding xml -- skipping: " + train);
          }
        }
        File xmlText = new File(testDir, pt+".xml");
        File test = new File(testDir, pt+".xml.txt");
        if(test.exists()){
          if(xmlText.exists()){
            files.add(test);
          }else{
            System.err.println("Text file in test has no corresponding xml -- skipping: " + test);
          }
        }
        assert !(train.exists() && test.exists());
      }
    }  else  {
      for (Integer set : patientSets) {
        final int setNum = set;
        for (File file : rawTextDirectory.listFiles(new FilenameFilter(){
          @Override
          public boolean accept(File dir, String name) {
            return name.contains(String.format("ID%03d", setNum));
          }})) {
          // skip hidden files like .svn
          if (!file.isHidden()) {
            if(xmlFormat == XMLFormat.Knowtator){
              files.add(file);
            }else{
              // look for equivalent in xml directory:
              File xmlFile = new File(xmlDirectory, file.getName());
              if(xmlFile.exists()){
                files.add(file);
              }else{
                System.err.println("Missing patient file : " + xmlFile);
              }
            }
          } 
        }
      }
    }
    return files;
  }


  @Override
  protected CollectionReader getCollectionReader(List<Integer> patientSets) throws Exception {
    return UriCollectionReader.getCollectionReaderFromFiles(this.getFilesFor(patientSets));
  }


  protected AggregateBuilder getPreprocessorAggregateBuilder() throws Exception {
    return this.xmiExists
        ? this.getXMIReadingPreprocessorAggregateBuilder()
        : this.getXMIWritingPreprocessorAggregateBuilder();
  }


  protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException {
    AggregateBuilder aggregateBuilder = new AggregateBuilder();
    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        XMIReader.class,
        XMIReader.PARAM_XMI_DIRECTORY,
        this.xmiDirectory));
    return aggregateBuilder;
  }


  /**
   * Builds the full preprocessing pipeline. In order, it: loads the document
   * text, creates the gold view and reads the manual annotations into it,
   * identifies segments, sentences and tokens, tags parts of speech, chunks,
   * builds lookup windows from NP chunks, runs the UMLS dictionary lookup,
   * LVG, the dependency parser and the semantic role labeler, adds either the
   * gold treebank parses or automatic constituency parses, and finally writes
   * the resulting CAS to the XMI directory.
   *
   * @return the builder holding all of the above components
   * @throws Exception if any component description cannot be created
   */
  protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
      throws Exception {
    AggregateBuilder aggregateBuilder = new AggregateBuilder();
    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());


    // read manual annotations into gold view:
    // first create the view and give it the same text as the default view
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        ViewCreatorAnnotator.class,
        ViewCreatorAnnotator.PARAM_VIEW_NAME,
        GOLD_VIEW_NAME));
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        ViewTextCopierAnnotator.class,
        ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
        CAS.NAME_DEFAULT_SOFA,
        ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
        GOLD_VIEW_NAME));
    // then add the reader for the configured format; the extra arguments map
    // the reader's default view onto the gold view
    switch (this.xmlFormat) {
    case Anafora:
      aggregateBuilder.add(
          THYMEAnaforaXMLReader.getDescription(this.xmlDirectory),
          CAS.NAME_DEFAULT_SOFA,
          GOLD_VIEW_NAME);
      break;
    case Knowtator:
      aggregateBuilder.add(
          THYMEKnowtatorXMLReader.getDescription(this.xmlDirectory),
          CAS.NAME_DEFAULT_SOFA,
          GOLD_VIEW_NAME);
      break;
    case I2B2:
      aggregateBuilder.add(
          I2B2TemporalXMLReader.getDescription(this.xmlDirectory),
          CAS.NAME_DEFAULT_SOFA,
          GOLD_VIEW_NAME);
      break;
    }


    // identify segments (I2B2 uses the simple segment annotator; the other
    // formats use the bracketed section tags found in the text)
    if(this.xmlFormat == XMLFormat.I2B2){
      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class));
    }else{
      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SegmentsFromBracketedSectionTagsAnnotator.class));
    }
    // identify sentences
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        SentenceDetector.class,
        SentenceDetector.SD_MODEL_FILE_PARAM,
        "org/apache/ctakes/core/sentdetect/sd-med-model.zip"));
    // identify tokens
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class));
    // merge some tokens
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class));


    // identify part-of-speech tags
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        POSTagger.class,
        TypeSystemDescriptionFactory.createTypeSystemDescription(),
        TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class),
        POSTagger.POS_MODEL_FILE_PARAM,
        "org/apache/ctakes/postagger/models/mayo-pos.zip"));


    // identify chunks
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        Chunker.class,
        Chunker.CHUNKER_MODEL_FILE_PARAM,
        FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"),
        Chunker.CHUNKER_CREATOR_CLASS_PARAM,
        DefaultChunkCreator.class));


    // identify UMLS named entities: the steps from here down to the
    // dictionary lookup prepare the lookup windows and then run the lookup


    // adjust NP in NP NP to span both
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        ChunkAdjuster.class,
        ChunkAdjuster.PARAM_CHUNK_PATTERN,
        new String[] { "NP", "NP" },
        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
        1));
    // adjust NP in NP PP NP to span all three
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        ChunkAdjuster.class,
        ChunkAdjuster.PARAM_CHUNK_PATTERN,
        new String[] { "NP", "PP", "NP" },
        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
        2));
    // add lookup windows for each NP
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class));
    // maximize lookup windows
    // NOTE(review): with these settings the annotator presumably deletes a
    // window (B) that is enveloped by another window (A) -- confirm against
    // the OverlapAnnotator documentation
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        OverlapAnnotator.class,
        "A_ObjectClass",
        LookupWindowAnnotation.class,
        "B_ObjectClass",
        LookupWindowAnnotation.class,
        "OverlapType",
        "A_ENV_B",
        "ActionType",
        "DELETE",
        "DeleteAction",
        new String[] { "selector=B" }));
    // add UMLS on top of lookup windows
    aggregateBuilder.add(
        UmlsDictionaryLookupAnnotator.createAnnotatorDescription()
        );


    // add lvg annotator
    // entries pair a lexical category name with a treebank tag, separated by '|'
    String[] XeroxTreebankMap = {
        "adj|JJ",
        "adv|RB",
        "aux|AUX",
        "compl|CS",
        "conj|CC",
        "det|DET",
        "modal|MD",
        "noun|NN",
        "prep|IN",
        "pron|PRP",
        "verb|VB" };
    // words handed to the annotator as its exclusion set (both casings listed)
    String[] ExclusionSet = {
        "and",
        "And",
        "by",
        "By",
        "for",
        "For",
        "in",
        "In",
        "of",
        "Of",
        "on",
        "On",
        "the",
        "The",
        "to",
        "To",
        "with",
        "With" };
    AnalysisEngineDescription lvgAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
        LvgAnnotator.class,
        "UseSegments",
        false,
        "SegmentsToSkip",
        new String[0],
        "UseCmdCache",
        false,
        "CmdCacheFileLocation",
        "/org/apache/ctakes/lvg/2005_norm.voc",
        "CmdCacheFrequencyCutoff",
        20,
        "ExclusionSet",
        ExclusionSet,
        "XeroxTreebankMap",
        XeroxTreebankMap,
        "LemmaCacheFileLocation",
        "/org/apache/ctakes/lvg/2005_lemma.voc",
        "UseLemmaCache",
        false,
        "LemmaCacheFrequencyCutoff",
        20,
        "PostLemmas",
        false,
        "LvgCmdApi",
        // NOTE(review): getResource(...) returns null when lvg.properties is
        // not on the classpath, and new File(URI) only accepts file: URIs --
        // confirm the resource is never loaded from inside a jar
        ExternalResourceFactory.createExternalResourceDescription(
            LvgCmdApiResourceImpl.class,
            new File(LvgCmdApiResourceImpl.class.getResource(
                "/org/apache/ctakes/lvg/data/config/lvg.properties").toURI())));
    aggregateBuilder.add(lvgAnnotator);


    // add dependency parser
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPDependencyParserAE.class));


    // add semantic role labeler
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPSemanticRoleLabelerAE.class));


    // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps
    if(this.treebankDirectory != null){
      aggregateBuilder.add(THYMETreebankReader.getDescription(this.treebankDirectory));
      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TimexAnnotationCorrector.class));
    }else{
      // add ctakes constituency parses to system view
      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class,
          ConstituencyParser.PARAM_MODEL_FILENAME,
          "org/apache/ctakes/constituency/parser/models/sharpacq-3.1.bin"));
      // disabled alternative: Berkeley parser wrapper with THYME models
//      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(BerkeleyParserWrapper.class,
//          BerkeleyParserWrapper.PARAM_MODEL_FILENAME,
//          
//        "org/apache/ctakes/constituency/parser/models/thyme.gcg.4sm.bin"));
//          "org/apache/ctakes/constituency/parser/models/thyme.4sm.bin"));
    }
    // write out the CAS after all the above annotations
    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
        XMIWriter.class,
        XMIWriter.PARAM_XMI_DIRECTORY,
        this.xmiDirectory));


    return aggregateBuilder;
  }


  public static <T extends Annotation> List<T> selectExact(JCas jCas, Class<T> annotationClass, Segment segment) {
    List<T> annotations = Lists.newArrayList();
    for (T annotation : JCasUtil.selectCovered(jCas, annotationClass, segment)) {
      if (annotation.getClass().equals(annotationClass)) {
        annotations.add(annotation);
      }
    }
    return annotations;
  }


  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {


    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
        if (chunk.getChunkType().equals("NP")) {
          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
        }
      }
    }
  }
  
  public static class RemoveEnclosedLookupWindows extends JCasAnnotator_ImplBase {


    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      List<LookupWindowAnnotation> lws = new ArrayList<LookupWindowAnnotation>(JCasUtil.select(jCas, LookupWindowAnnotation.class));
      // we'll navigate backwards so that as we delete things we shorten the list from the back
      for(int i = lws.size()-2; i >= 0; i--){
        LookupWindowAnnotation lw1 = lws.get(i);
        LookupWindowAnnotation lw2 = lws.get(i+1);
        if(lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= lw2.getEnd()){
          /// lw1 envelops or encloses lw2
          lws.remove(i+1);
          lw2.removeFromIndexes();
        }
      }
      
    }
    
  }


  public static class EntityMentionRemover extends JCasAnnotator_ImplBase {


    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) {
        mention.removeFromIndexes();
      }
    }
  }


  public static class EventMentionRemover extends JCasAnnotator_ImplBase {


    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      for (EventMention mention : Lists.newArrayList(JCasUtil.select(jCas, EventMention.class))) {
        mention.removeFromIndexes();
      }
    }
  }
  
  // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed
  public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase {
    private static Pattern SECTION_PATTERN = Pattern.compile(
        "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])",
        Pattern.DOTALL);


    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      Matcher matcher = SECTION_PATTERN.matcher(jCas.getDocumentText());
      while (matcher.find()) {
        Segment segment = new Segment(jCas);
        segment.setBegin(matcher.start() + matcher.group(1).length());
        segment.setEnd(matcher.end() - matcher.group(3).length());
        segment.setId(matcher.group(2));
        segment.addToIndexes();
      }
    }
  }


  static File getXMIFile(File xmiDirectory, File textFile) {
    return new File(xmiDirectory, textFile.getName() + ".xmi");
  }


  static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException {
    return getXMIFile(xmiDirectory, new File(ViewURIUtil.getURI(jCas).getPath()));
  }


  public static class XMIWriter extends JCasAnnotator_ImplBase {


    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";


    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
    private File xmiDirectory;


    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
      super.initialize(context);
      if (!this.xmiDirectory.exists()) {
        this.xmiDirectory.mkdirs();
      }
    }


    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
      try {
        FileOutputStream outputStream = new FileOutputStream(xmiFile);
        try {
          XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
          ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
          serializer.serialize(jCas.getCas(), handler);
        } finally {
          outputStream.close();
        }
      } catch (SAXException e) {
        throw new AnalysisEngineProcessException(e);
      } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
      }
    }
  }


  public static class XMIReader extends JCasAnnotator_ImplBase {


    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";


    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
    private File xmiDirectory;


    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
      try {
        FileInputStream inputStream = new FileInputStream(xmiFile);
        try {
          XmiCasDeserializer.deserialize(inputStream, jCas.getCas());
        } finally {
          inputStream.close();
        }
      } catch (SAXException e) {
        throw new AnalysisEngineProcessException(e);
      } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
      }
    }
  }
  
  public static class TimexAnnotationCorrector extends JCasAnnotator_ImplBase {
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      JCas goldView, systemView;
      try {
        goldView = jCas.getView(GOLD_VIEW_NAME);
        systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
      } catch (CASException e) {
        e.printStackTrace();
        throw new AnalysisEngineProcessException();
      }
      for(TimeMention mention : JCasUtil.select(goldView, TimeMention.class)){
        // for each time expression, get the treebank node with the same span.
        List<TreebankNode> nodes = JCasUtil.selectCovered(systemView, TreebankNode.class, mention);
        TreebankNode sameSpanNode = null;
        for(TreebankNode node : nodes){
          if(node.getBegin() == mention.getBegin() && node.getEnd() == mention.getEnd()){
            sameSpanNode = node;
            break;
          }
        }
        if(sameSpanNode != null){
          // look at node at the position of the timex3.
          if(sameSpanNode.getNodeType().equals("PP")){
            // if it is a PP it should be moved down to the NP
            int numChildren = sameSpanNode.getChildren().size();
            if(numChildren == 2 && sameSpanNode.getChildren(0).getNodeType().equals("IN") && sameSpanNode.getChildren(1).getNodeType().equals("NP")){
              // move the time span to this node:
              TreebankNode mentionNode = sameSpanNode.getChildren(numChildren-1);
              mention.setBegin(mentionNode.getBegin());
              mention.setEnd(mentionNode.getEnd());
            }
          }
        }else{
          // if there is no matching tree span, see if the DT to the left would help.
          // now adjust for missing DT to the left
          List<TerminalTreebankNode> precedingPreterms = JCasUtil.selectPreceding(systemView, TerminalTreebankNode.class, mention, 1);
          if(precedingPreterms != null && precedingPreterms.size() == 1){
            TerminalTreebankNode leftTerm = precedingPreterms.get(0);
            if(leftTerm.getNodeType().equals("DT")){
              // now see if adding this would make it match a tree
              List<TreebankNode> matchingNodes = JCasUtil.selectCovered(systemView, TreebankNode.class, leftTerm.getBegin(), mention.getEnd());
              for(TreebankNode node : matchingNodes){
                if(node.getBegin() == leftTerm.getBegin() && node.getEnd() == mention.getEnd()){
                  sameSpanNode = node;
                  break;
                }
              }
              if(sameSpanNode != null){
                // adding the DT to the left of th emention made it match a tree:
                System.err.println("Adding DT: " + leftTerm.getCoveredText() + " to TIMEX: " + mention.getCoveredText());
                mention.setBegin(leftTerm.getBegin());
              }
            }
          }
        }
      }
    }
  }




  public static class CopyFromGold extends JCasAnnotator_ImplBase {


    public static AnalysisEngineDescription getDescription(Class<?>... classes)
        throws ResourceInitializationException {
      return AnalysisEngineFactory.createPrimitiveDescription(
          CopyFromGold.class,
          CopyFromGold.PARAM_ANNOTATION_CLASSES,
          classes);
    }


    public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses";


    @ConfigurationParameter(name = PARAM_ANNOTATION_CLASSES, mandatory = true)
    private Class<? extends TOP>[] annotationClasses;


    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      JCas goldView, systemView;
      try {
        goldView = jCas.getView(GOLD_VIEW_NAME);
        systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
      } catch (CASException e) {
        throw new AnalysisEngineProcessException(e);
      }
      for (Class<? extends TOP> annotationClass : this.annotationClasses) {
        for (TOP annotation : Lists.newArrayList(JCasUtil.select(systemView, annotationClass))) {
          if (annotation.getClass().equals(annotationClass)) {
            annotation.removeFromIndexes();
          }
        }
      }
      CasCopier copier = new CasCopier(goldView.getCas(), systemView.getCas());
      Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA);
      for (Class<? extends TOP> annotationClass : this.annotationClasses) {
        for (TOP annotation : JCasUtil.select(goldView, annotationClass)) {
          TOP copy = (TOP) copier.copyFs(annotation);
          if (copy instanceof Annotation) {
            copy.setFeatureValue(sofaFeature, systemView.getSofa());
          }
          copy.addToIndexes(systemView);
        }
      }
    }
  }
  
  public static class WriteI2B2XML extends JCasAnnotator_ImplBase {
    public static final String PARAM_OUTPUT_DIR="PARAM_OUTPUT_DIR";
    @ConfigurationParameter(mandatory=true,description="Output directory to write xml files to.",name=PARAM_OUTPUT_DIR)
    protected String outputDir;
    
    @Override
    public void process(JCas jcas) throws AnalysisEngineProcessException {
      try {
        // get the output file name from the input file name and output directory.
        File outDir = new File(outputDir);
        if(!outDir.exists()) outDir.mkdirs();
        File inFile = new File(ViewURIUtil.getURI(jcas));
        String outFile = inFile.getName().replace(".txt", "");
        
        // build the xml
        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
        Document doc = docBuilder.newDocument();
        Element rootElement = doc.createElement("ClinicalNarrativeTemporalAnnotation");
        Element textElement = doc.createElement("TEXT");
        Element tagsElement = doc.createElement("TAGS");
        textElement.setTextContent(jcas.getDocumentText());
        rootElement.appendChild(textElement);
        rootElement.appendChild(tagsElement);
        doc.appendChild(rootElement);
        
        Map<IdentifiedAnnotation,String> argToId = new HashMap<>();
        int id=0;
        for(TimeMention timex : JCasUtil.select(jcas, TimeMention.class)){
          Element timexElement = doc.createElement("TIMEX3");
          String timexID = "T"+id; id++;
          argToId.put(timex, timexID);
          timexElement.setAttribute("id", timexID);
          timexElement.setAttribute("start", String.valueOf(timex.getBegin()+1));
          timexElement.setAttribute("end", String.valueOf(timex.getEnd()+1));
          timexElement.setAttribute("text", timex.getCoveredText());
          timexElement.setAttribute("type", "NA");
          timexElement.setAttribute("val", "NA");
          timexElement.setAttribute("mod", "NA");
          tagsElement.appendChild(timexElement);
        }
        
        id = 0;
        for(EventMention event : JCasUtil.select(jcas, EventMention.class)){
          if (event.getClass().equals(EventMention.class)) {
            // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events
            Element eventEl = doc.createElement("EVENT");
            String eventID = "E"+id;  id++;
            argToId.put(event, eventID);
            eventEl.setAttribute("id", eventID);
            eventEl.setAttribute("start", String.valueOf(event.getBegin()+1));
            eventEl.setAttribute("end", String.valueOf(event.getEnd()+1));
            eventEl.setAttribute("text", event.getCoveredText());
            eventEl.setAttribute("modality", "NA");
            eventEl.setAttribute("polarity", "NA");
            eventEl.setAttribute("type", "NA");
            tagsElement.appendChild(eventEl);
          }
        }
        
        id = 0;
        for(TemporalTextRelation rel : JCasUtil.select(jcas, TemporalTextRelation.class)){
          Element linkEl = doc.createElement("TLINK");
          String linkID = "TL"+id; id++;
          linkEl.setAttribute("id", linkID);
          Annotation arg1 = rel.getArg1().getArgument();
          linkEl.setAttribute("fromID", argToId.get(arg1));
          linkEl.setAttribute("fromText", arg1.getCoveredText());
          Annotation arg2 = rel.getArg2().getArgument();
          linkEl.setAttribute("toID", argToId.get(arg2));
          linkEl.setAttribute("toText", arg2.getCoveredText());
          linkEl.setAttribute("type", rel.getCategory());
          tagsElement.appendChild(linkEl);
        }
        
        // boilerplate xml-writing code:
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        Transformer transformer = transformerFactory.newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty(OutputKeys.METHOD, "xml");
        DOMSource source = new DOMSource(doc);
        StreamResult result = new StreamResult(new File(outputDir, outFile));
        transformer.transform(source, result);
      } catch (ParserConfigurationException e) {
        e.printStackTrace();
        throw new AnalysisEngineProcessException(e);
      } catch (TransformerConfigurationException e) {
        e.printStackTrace();
        throw new AnalysisEngineProcessException(e);
      } catch (TransformerException e) {
        e.printStackTrace();
        throw new AnalysisEngineProcessException(e);
      }


    }
    
  }
}
// Source Code of org.apache.ctakes.temporal.eval.Evaluation_ImplBase

// Related Classes of org.apache.ctakes.temporal.eval.Evaluation_ImplBase