// Source code of org.apache.ctakes.temporal.duration.EventDurationDistribution$TemporalDurationExtractor

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.temporal.duration;


import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.ctakes.relationextractor.eval.XMIReader;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.ctakes.typesystem.type.textsem.MedicationMention;
import org.apache.ctakes.typesystem.type.textsem.TimeMention;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.uimafit.component.JCasAnnotator_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.CollectionReaderFactory;
import org.uimafit.pipeline.SimplePipeline;
import org.uimafit.util.JCasUtil;

import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Multiset;
import com.google.common.io.Files;


/**
 * Extract durations of event mentions (e.g. sign/symptom or disease/disorder).
 * 
 * @author dmitriy dligach
 */
public class EventDurationDistribution {


  private static Class<? extends EventMention> targetClass = MedicationMention.class;
  
  public static class Options  {
    @Option(
        name = "--input-dir",
        usage = "specify the path to the directory containing the xmi files",
        required = true)
    public File inputDirectory;
    
    @Option(
        name = "--output-file",
        usage = "specify the path to the output file",
        required = true)
    public String outputFile;
  }
  
  public static void main(String[] args) throws Exception {
    
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);
    parser.parseArgument(args);
    
    
    List<File> trainFiles = Arrays.asList(options.inputDirectory.listFiles());
    CollectionReader collectionReader = getCollectionReader(trainFiles);
    
    AnalysisEngine temporalDurationExtractor = AnalysisEngineFactory.createPrimitive(
        TemporalDurationExtractor.class,
        "OutputFile",
        options.outputFile);
        
    SimplePipeline.runPipeline(collectionReader, temporalDurationExtractor);
  }
  
  public static class TemporalDurationExtractor extends JCasAnnotator_ImplBase {
    
    @ConfigurationParameter(
        name = "OutputFile",
        mandatory = true,
        description = "path to the output file that will store the distributions")
    private String outputFilePath;
    private File outputFile;
    
    // regular expression to match temporal durations in time mention annotations
    private final static String regex = "(sec|min|hour|hrs|day|week|wk|month|year|yr|decade)";
    
    // mapping between time units and their normalized forms
    private final static Map<String, String> abbreviationToTimeUnit = ImmutableMap.<String, String>builder()
        .put("sec", "second")
        .put("min", "minute")
        .put("hour", "hour")
        .put("hrs", "hour")
        .put("day", "day")
        .put("week", "week")
        .put("wk", "week")
        .put("month", "month")
        .put("year", "year")
        .put("yr", "year")
        .put("decade", "decade")
        .build(); 
    
    // max distance between an event and the time mention that defines the event's duration
    private final static int MAXDISTANCE = 2;


    // regex to match different time units (e.g. 'day', 'month')
    Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException  {
      super.initialize(context);
      outputFile = new File(outputFilePath);
      if(outputFile.exists()) {
        System.out.println(outputFile + " exists... deleting...");
        outputFile.delete();
      }
    }
    
    
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {


      Collection<DocumentID> ids = JCasUtil.select(jCas, DocumentID.class);
      String fileName = ids.iterator().next().getDocumentID();
      String mentionText = fileName.split("\\.")[0]; // e.g. "smoker.txt"


      // counts of different time units for this sign/symptom
      Multiset<String> durationDistribution = HashMultiset.create();


      for(EventMention mention : JCasUtil.select(jCas, targetClass)) {
        if(mention.getCoveredText().equals(mentionText)) {
          if(isNegated(jCas, mention) || isMedicationPattern(jCas, mention)) {
            continue;
          }


          TimeMention nearestTimeMention = getNearestTimeMention(jCas, mention);
          if(nearestTimeMention == null) {
            continue;
          }
          
          // try to parse this timex with Bethard normalizer
          HashSet<String> timeUnits = Utils.getTimeUnits(nearestTimeMention.getCoveredText());
          if(timeUnits.size() > 0) {
            for(String timeUnit : timeUnits) {
              durationDistribution.add(timeUnit);
            }
          } else {
            // could be an abbreviation e.g. "wks"
            Matcher matcher = pattern.matcher(nearestTimeMention.getCoveredText());
            // need a loop to handle things like 'several days/weeks'
            while(matcher.find()) {
              String matchedTimeUnit = matcher.group(); // e.g. "wks"
              String normalizedTimeUnit = abbreviationToTimeUnit.get(matchedTimeUnit);
              durationDistribution.add(normalizedTimeUnit);
            }            
          }
        }
      }


      if(durationDistribution.size() > 0) { 
        try {
          Files.append(Utils.formatDistribution(mentionText, durationDistribution, ", ", false) + "\n", outputFile, Charsets.UTF_8);
        } catch (IOException e) {
          System.out.println("Could not open output file: " + outputFile);
        } 
      } else {
        System.out.println("No duration data for: " + mentionText);
      }
    }
    
    /**
     * Return true if sign/symptom is negated.
     * TODO: using rules for now; switch to using a negation module
     */
    private static boolean isNegated(JCas jCas, EventMention mention) {
      
      for(BaseToken token : JCasUtil.selectPreceding(jCas, BaseToken.class, mention, 3)) {
        if(token.getCoveredText().equals("no") || 
           token.getCoveredText().equals("not") || 
           token.getCoveredText().equals("off")) {
          return true;
        }
      }
      
      return false;
    }


    /**
     * Return true of this is a medication pattern. 
     * E.g. five (5) ml po qid  (4 times a day) as needed for heartburn for 2 weeks.
     */
    private static boolean isMedicationPattern(JCas jCas, EventMention mention) {
      
      for(BaseToken token : JCasUtil.selectPreceding(jCas, BaseToken.class, mention, 1)) {
        if(token.getCoveredText().equals("for")) {
          return true;
        }
      }
           
      return false;
    }
    
    /**
     * Find nearest time mention on the right that is within allowable distance. 
     * Return null if none found.
     */
    private static TimeMention getNearestTimeMention(JCas jCas, EventMention mention) {


      List<TimeMention> timeMentions = JCasUtil.selectFollowing(jCas, TimeMention.class, mention, 1);
      if(timeMentions.size() < 1) {
        return null;
      }
      
      assert timeMentions.size() == 1;
      
      TimeMention nearestTimeMention = timeMentions.get(0);
      int distance = JCasUtil.selectBetween(jCas, BaseToken.class, mention, nearestTimeMention).size();
      if(distance > MAXDISTANCE) {
        return null;
      }
      
      return nearestTimeMention;
    }
    
    @SuppressWarnings("unused")
    private static String getAnnotationContext(Annotation annotation, int maxContextWindowSize) {
      
      String text = annotation.getCAS().getDocumentText();
      int begin = Math.max(0, annotation.getBegin() - maxContextWindowSize);
      int end = Math.min(text.length(), annotation.getEnd() + maxContextWindowSize);
      
      return text.substring(begin, end).replaceAll("[\r\n]", " ");
    }
    
    @SuppressWarnings("unused")
    private static String formatDistribution(Multiset<String> durationDistribution) {
      
      List<String> durationBins = Arrays.asList("second", "minute", "hour", "day", "week", "month", "year", "decade");
      List<Integer> durationValues = new LinkedList<Integer>();
      
      for(String durationBin : durationBins) {
        durationValues.add(durationDistribution.count(durationBin));
      }


      Joiner joiner = Joiner.on(',');
      return joiner.join(durationValues);
    }
  }
  
  private static CollectionReader getCollectionReader(List<File> items) throws Exception {


    String[] paths = new String[items.size()];
    Collections.sort(items, new FileSizeComparator());
    for (int i = 0; i < paths.length; ++i) {
      paths[i] = items.get(i).getPath();
    }
    
    return CollectionReaderFactory.createCollectionReader(
        XMIReader.class,
        XMIReader.PARAM_FILES,
        paths);
  }
  
  public static class FileSizeComparator implements Comparator<File> {


    @Override
    public int compare(File o1, File o2) {
      if(o1.length() > o2.length()){
        return 1;
      }else if(o1.length() < o2.length()){
        return -1;
      }else{
        return 0;
      }
    } 
  }
}
// Source code of org.apache.ctakes.temporal.duration.EventDurationDistribution$TemporalDurationExtractor
//
// Related classes of org.apache.ctakes.temporal.duration.EventDurationDistribution$TemporalDurationExtractor