/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.temporal.duration;
import info.bethard.timenorm.Period;
import info.bethard.timenorm.PeriodSet;
import info.bethard.timenorm.Temporal;
import info.bethard.timenorm.TemporalExpressionParser;
import info.bethard.timenorm.TimeSpan;
import info.bethard.timenorm.TimeSpanSet;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.ctakes.core.cr.XMIReader;
import org.apache.ctakes.core.resource.FileLocator;
import org.apache.ctakes.temporal.ae.feature.duration.DurationEventTimeFeatureExtractor;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.cleartk.classifier.Feature;
import org.threeten.bp.temporal.TemporalField;
import org.threeten.bp.temporal.TemporalUnit;
import org.uimafit.factory.CollectionReaderFactory;
import org.uimafit.util.JCasUtil;
import scala.collection.immutable.Set;
import scala.util.Try;
import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Multiset;
import com.google.common.io.Files;
import com.google.common.io.LineProcessor;
import com.googlecode.clearnlp.engine.EngineGetter;
import com.googlecode.clearnlp.morphology.AbstractMPAnalyzer;
import com.googlecode.clearnlp.reader.AbstractReader;
/**
 * Various useful classes and methods for evaluating event duration data.
 */
public class Utils {

  // events and their duration distributions
  public static final String durationDistributionPath = "/Users/dima/Boston/Thyme/Duration/Data/Combined/Distribution/all.txt";

  // eight bins over which we define a duration distribution
  public static final String[] bins = {"second", "minute", "hour", "day", "week", "month", "year", "decade"};

  // bins as a set for O(1) membership tests; built once instead of on every putInBin() call
  private static final HashSet<String> BIN_SET = new HashSet<>(Arrays.asList(bins));

  // units that the Bethard normalizer outputs mapped to one of the eight bins above;
  // built once instead of on every putInBin() call
  private static final Map<String, String> UNIT_TO_BIN = ImmutableMap.<String, String>builder()
      .put("afternoon", "hour")
      .put("evening", "hour")
      .put("morning", "hour")
      .put("night", "hour")
      .put("fall", "month")
      .put("winter", "month")
      .put("spring", "month")
      .put("summer", "month")
      .put("quarteryear", "month")
      .build();

  // duration of each bin in seconds (month = 30 days, year = 365 days);
  // largest value (decade = 315,360,000) fits comfortably in an int
  private static final Map<String, Integer> TIME_UNIT_IN_SECONDS = ImmutableMap.<String, Integer>builder()
      .put("second", 1)
      .put("minute", 60)
      .put("hour", 60 * 60)
      .put("day", 60 * 60 * 24)
      .put("week", 60 * 60 * 24 * 7)
      .put("month", 60 * 60 * 24 * 30)
      .put("year", 60 * 60 * 24 * 365)
      .put("decade", 60 * 60 * 24 * 365 * 10)
      .build();

  // cached Bethard parser; loading the grammar resource is expensive, so do it once
  // NOTE(review): assumes TemporalExpressionParser is reusable across calls -- confirm
  private static TemporalExpressionParser timexParser = null;

  /**
   * Lazily create and cache the Bethard temporal expression parser.
   * The original code re-read the grammar resource and rebuilt the parser
   * on every runTimexParser() call.
   */
  private static synchronized TemporalExpressionParser getTimexParser() {
    if (timexParser == null) {
      URL grammarURL = DurationEventTimeFeatureExtractor.class.getResource("/info/bethard/timenorm/en.grammar");
      timexParser = new TemporalExpressionParser(grammarURL);
    }
    return timexParser;
  }

  /**
   * Extract time unit(s) from a temporal expression
   * and put in one of the eight bins above.
   * Return empty set if time units could not be extracted.
   * E.g. July 5, 1984 -&gt; day
   *
   * @param timex temporal expression text, e.g. "three months"
   * @return set of bin names (possibly empty, never null)
   */
  public static HashSet<String> getTimeUnits(String timex) {
    HashSet<String> timeUnits = new HashSet<>();
    Set<TemporalUnit> units = runTimexParser(timex.toLowerCase());
    if (units == null) {
      return timeUnits;
    }
    scala.collection.Iterator<TemporalUnit> iterator = units.iterator();
    while (iterator.hasNext()) {
      String bin = putInBin(iterator.next().getName());
      if (bin != null) {
        timeUnits.add(bin);
      }
    }
    return timeUnits;
  }

  /**
   * Use Bethard normalizer to map a temporal expression to a time unit.
   *
   * @param timex temporal expression text (expected lowercased by the caller)
   * @return the parsed units, or null if parsing failed or yielded no usable units
   */
  public static Set<TemporalUnit> runTimexParser(String timex) {
    // fixed arbitrary anchor date; only matters for anchored expressions like "yesterday"
    TimeSpan anchor = TimeSpan.of(2013, 12, 16);
    Try<Temporal> result = getTimexParser().parse(timex, anchor);
    Set<TemporalUnit> units = null;
    if (result.isSuccess()) {
      Temporal temporal = result.get();
      if (temporal instanceof Period) {
        units = ((Period) temporal).unitAmounts().keySet();
      } else if (temporal instanceof PeriodSet) {
        units = ((PeriodSet) temporal).period().unitAmounts().keySet();
      } else if (temporal instanceof TimeSpan) {
        units = ((TimeSpan) temporal).period().unitAmounts().keySet();
      } else if (temporal instanceof TimeSpanSet) {
        // TODO: fill units by calling .getBaseUnit() on each field
        Set<TemporalField> fields = ((TimeSpanSet) temporal).fields().keySet();
        units = null;
      }
    }
    return units;
  }

  /**
   * Take the time unit from Bethard normalizer
   * and return a coarser time unit, i.e. one of the eight bins.
   * Return null, if this cannot be done.
   *
   * @param timeUnit unit name as produced by the normalizer, e.g. "Years"
   * @return one of {@link #bins}, or null if the unit cannot be mapped
   */
  public static String putInBin(String timeUnit) {
    // guard: substring(0, -1) below would throw on empty input
    if (timeUnit == null || timeUnit.isEmpty()) {
      return null;
    }
    // e.g. Years -> year
    String singularAndLowercased = timeUnit.substring(0, timeUnit.length() - 1).toLowerCase();
    // is this one of the bins already?
    if (BIN_SET.contains(singularAndLowercased)) {
      return singularAndLowercased;
    }
    // it's not one of the bins; map it to a bin if we can (null otherwise)
    return UNIT_TO_BIN.get(singularAndLowercased);
  }

  /**
   * Compute expected duration in seconds. Normalize by number of seconds in a decade.
   * The distribution's keys must be a subset of the eight bins.
   *
   * @param distribution bin name -> probability
   * @return expected duration as a fraction of a decade
   */
  public static float expectedDuration(Map<String, Float> distribution) {
    float expectation = 0f;
    for (Map.Entry<String, Float> entry : distribution.entrySet()) {
      expectation += TIME_UNIT_IN_SECONDS.get(entry.getKey()) * entry.getValue();
    }
    return expectation / TIME_UNIT_IN_SECONDS.get("decade");
  }

  /**
   * Take a time unit and return a probability distribution
   * in which p(this time unit) = 1 and all others are zero.
   * Assume time unit is one of the eight duration bins.
   *
   * @param timeUnit one of {@link #bins}
   * @return distribution over all eight bins
   */
  public static Map<String, Float> convertToDistribution(String timeUnit) {
    Map<String, Float> distribution = new HashMap<String, Float>();
    for (String bin : bins) {
      distribution.put(bin, bin.equals(timeUnit) ? 1.0f : 0.0f);
    }
    return distribution;
  }

  /**
   * Convert duration distribution multiset to a format that's easy to parse automatically.
   * Format: &lt;sign/symptom&gt;, &lt;time bin&gt;:&lt;count&gt;, ...
   * Example: apnea, second:5, minute:1, hour:5, day:10, week:1, month:0, year:0
   *
   * @param mentionText          text of the event mention (first field of the output)
   * @param durationDistribution bin name -> observation count
   * @param separator            string placed between the output fields
   * @param normalize            if true, emit probabilities (counts / total) instead of raw counts
   */
  public static String formatDistribution(
      String mentionText,
      Multiset<String> durationDistribution,
      String separator,
      boolean normalize) {
    List<String> distribution = new LinkedList<String>();
    distribution.add(mentionText);
    double total = 0;
    if (normalize) {
      for (String bin : bins) {
        total += durationDistribution.count(bin);
      }
    }
    for (String bin : bins) {
      if (normalize) {
        // guard against 0/0 = NaN when the multiset contains none of the bins
        double value = total == 0 ? 0 : durationDistribution.count(bin) / total;
        distribution.add(String.format("%s:%.3f", bin, value));
      } else {
        distribution.add(String.format("%s:%d", bin, durationDistribution.count(bin)));
      }
    }
    return Joiner.on(separator).join(distribution);
  }

  /**
   * Get relation context: the text spanned by both arguments,
   * padded by a few characters on each side, with newlines flattened to spaces.
   */
  public static String getTextBetweenAnnotations(JCas jCas, Annotation arg1, Annotation arg2) {
    final int windowSize = 5;
    String text = jCas.getDocumentText();
    int leftArgBegin = Math.min(arg1.getBegin(), arg2.getBegin());
    int rightArgEnd = Math.max(arg1.getEnd(), arg2.getEnd());
    // clamp the padded window to the document boundaries
    int begin = Math.max(0, leftArgBegin - windowSize);
    int end = Math.min(text.length(), rightArgEnd + windowSize);
    return text.substring(begin, end).replaceAll("[\r\n]", " ");
  }

  /**
   * Lemmatize word using ClearNLP lemmatizer.
   *
   * @param word surface form
   * @param pos  part-of-speech tag for the word
   * @throws IOException if the lemmatizer model cannot be read
   */
  public static String lemmatize(String word, String pos) throws IOException {
    final String ENG_LEMMATIZER_DATA_FILE = "org/apache/ctakes/dependency/parser/models/lemmatizer/dictionary-1.3.1.jar";
    // try-with-resources: the original leaked the stream if getMPAnalyzer() threw
    try (InputStream lemmatizerModel = FileLocator.getAsStream(ENG_LEMMATIZER_DATA_FILE)) {
      AbstractMPAnalyzer lemmatizer = EngineGetter.getMPAnalyzer(AbstractReader.LANG_EN, lemmatizerModel);
      return lemmatizer.getLemma(word, pos);
    }
  }

  /**
   * Return system generated POS tag or null if none available.
   * Uses the tag of the first base token covered by the annotation.
   */
  public static String getPosTag(JCas systemView, Annotation annotation) {
    List<BaseToken> coveringBaseTokens = JCasUtil.selectCovered(
        systemView,
        BaseToken.class,
        annotation.getBegin(),
        annotation.getEnd());
    if (coveringBaseTokens.isEmpty()) {
      return null;
    }
    return coveringBaseTokens.get(0).getPartOfSpeech();
  }

  /**
   * Keep UMLS concepts and non-verbs intact. Lemmatize verbs.
   * Lowercase before returning.
   *
   * @throws AnalysisEngineProcessException if the "_InitialView" view cannot be obtained
   */
  public static String normalizeEventText(JCas jCas, Annotation annotation)
      throws AnalysisEngineProcessException {
    JCas systemView;
    try {
      systemView = jCas.getView("_InitialView");
    } catch (CASException e) {
      throw new AnalysisEngineProcessException(e);
    }
    // a nonzero type id marks a UMLS concept; leave those intact
    List<EventMention> coveringSystemEventMentions = JCasUtil.selectCovered(
        systemView,
        EventMention.class,
        annotation.getBegin(),
        annotation.getEnd());
    for (EventMention systemEventMention : coveringSystemEventMentions) {
      if (systemEventMention.getTypeID() != 0) {
        return annotation.getCoveredText().toLowerCase();
      }
    }
    String pos = Utils.getPosTag(systemView, annotation);
    if (pos == null) {
      return annotation.getCoveredText().toLowerCase();
    }
    String text;
    if (pos.startsWith("V")) {
      // verb: reduce to its lemma; fall back to the raw text if lemmatization fails
      try {
        text = Utils.lemmatize(annotation.getCoveredText().toLowerCase(), pos);
      } catch (IOException e) {
        System.out.println("couldn't lemmatize: " + annotation.getCoveredText());
        e.printStackTrace();
        return annotation.getCoveredText().toLowerCase();
      }
    } else {
      text = annotation.getCoveredText();
    }
    return text.toLowerCase();
  }

  /**
   * Read event duration distributions from file.
   * Feed to e.g. Guava's Files.readLines(); getResult() then returns
   * a map from event text to its duration distribution.
   */
  public static class Callback implements LineProcessor<Map<String, Map<String, Float>>> {

    // map event text to its duration distribution
    private Map<String, Map<String, Float>> textToDistribution;

    public Callback() {
      textToDistribution = new HashMap<String, Map<String, Float>>();
    }

    @Override
    public boolean processLine(String line) throws IOException {
      String[] elements = line.split(", "); // e.g. pain, second:0.000, minute:0.005, hour:0.099, ...
      Map<String, Float> distribution = new HashMap<String, Float>();
      for (int durationBinNumber = 1; durationBinNumber < elements.length; durationBinNumber++) {
        String[] durationAndValue = elements[durationBinNumber].split(":"); // e.g. "day:0.475"
        distribution.put(durationAndValue[0], Float.parseFloat(durationAndValue[1]));
      }
      textToDistribution.put(elements[0], distribution);
      return true; // keep processing subsequent lines
    }

    @Override
    public Map<String, Map<String, Float>> getResult() {
      return textToDistribution;
    }
  }

  /**
   * Instantiate an XMI collection reader over the given files.
   * Hidden files (e.g. .svn) are skipped.
   */
  public static CollectionReader getCollectionReader(List<File> inputFiles) throws Exception {
    List<String> fileNames = new ArrayList<>();
    for (File file : inputFiles) {
      if (!file.isHidden()) {
        fileNames.add(file.getPath());
      }
    }
    String[] paths = fileNames.toArray(new String[fileNames.size()]);
    return CollectionReaderFactory.createCollectionReader(
        XMIReader.class,
        XMIReader.PARAM_FILES,
        paths);
  }

  /**
   * Get files for specific sets of patients.
   * Useful for selecting e.g. only training files.
   * File names are matched on the "ID%03d" pattern, e.g. "ID007".
   */
  public static List<File> getFilesFor(List<Integer> patientSets, File inputDirectory) {
    List<File> files = new ArrayList<>();
    for (Integer set : patientSets) {
      final String patientId = String.format("ID%03d", set);
      File[] matches = inputDirectory.listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
          return name.contains(patientId);
        }
      });
      // listFiles() returns null when the directory cannot be read
      if (matches == null) {
        continue;
      }
      for (File file : matches) {
        // skip hidden files like .svn
        if (!file.isHidden()) {
          files.add(file);
        }
      }
    }
    return files;
  }

  /**
   * Escape characters that would break the comma/colon-delimited instance format.
   */
  private static String escapeFeatureText(String text) {
    return text.replace(",", "COMMA").replace(":", "COLON").replace("\n", "EOL");
  }

  /**
   * Output label and list of cleartk features to a file for debugging.
   * String-valued features are written as name-value:1 pairs,
   * integer-valued features as name:value pairs; other value types are skipped.
   */
  public static void writeInstance(String label, List<Feature> features, String fileName) {
    StringBuilder output = new StringBuilder(label);
    for (Feature feature : features) {
      if (feature.getName() == null || feature.getValue() == null) {
        continue;
      }
      String name = escapeFeatureText(feature.getName());
      Object value = feature.getValue();
      if (value instanceof String) {
        output.append(String.format(",%s-%s:%s", name, escapeFeatureText(value.toString()), 1));
      } else if (value instanceof Integer) {
        output.append(String.format(",%s:%s", name, escapeFeatureText(value.toString())));
      }
    }
    try {
      Files.append(output + "\n", new File(fileName), Charsets.UTF_8);
    } catch (IOException e) {
      System.err.println("could not write to output file!");
    }
  }

  public static void main(String[] args) {
    HashSet<String> timeUnits = getTimeUnits("three months");
    System.out.println(timeUnits);
  }
}