Package edu.stanford.nlp.time

Source Code of edu.stanford.nlp.time.HeidelTimeAnnotator

package edu.stanford.nlp.time;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.DataFilePaths;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.SystemUtils;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.*;
import java.util.regex.Pattern;


/**
* Annotates text using HeidelTime
*
* GUTIME/TimeML specifications can be found at:
* <a href="http://www.timeml.org/site/tarsqi/modules/gutime/index.html">
* http://www.timeml.org/site/tarsqi/modules/gutime/index.html</a>.
*
* TODO heideltime doesn't actually run on the NLP machines :( (TreeTagger doesn't run)
* @author Gabor Angeli
*/
public class HeidelTimeAnnotator implements Annotator {

  private static final String BASE_PATH = "$NLP_DATA_HOME/packages/heideltime/";
  private static final String DEFAULT_PATH = DataFilePaths.convert(BASE_PATH);
  private final File heideltimePath;
  private final boolean outputResults;

  // if used in a pipeline or constructed with a Properties object,
  // this property tells the annotator where to find the script
  public static final String HEIDELTIME_PATH_PROPERTY = "heideltime.path";
  public static final String HEIDELTIME_OUTPUT_RESULTS = "heideltime.outputResults";

  public HeidelTimeAnnotator() {
    this(new File(System.getProperty("heideltime", DEFAULT_PATH)));
  }

  public HeidelTimeAnnotator(File heideltimePath) {
    this.heideltimePath = heideltimePath;
    this.outputResults = false;
  }

  public HeidelTimeAnnotator(String name, Properties props) {
    String path = props.getProperty(HEIDELTIME_PATH_PROPERTY,
                                    System.getProperty("heideltime",
                                                       DEFAULT_PATH));
    this.heideltimePath = new File(path);

    this.outputResults =
      Boolean.valueOf(props.getProperty(HEIDELTIME_OUTPUT_RESULTS, "false"));
  }

  public void annotate(Annotation annotation) {
    try {
      this.annotate((CoreMap)annotation);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  public void annotate(CoreMap document) throws IOException {
    //--Create Input File
    //(create file)
    File inputFile = File.createTempFile("heideltime", ".input");
    //(write to file)
    PrintWriter inputWriter = new PrintWriter(inputFile);
    inputWriter.println(document.get(CoreAnnotations.TextAnnotation.class));
    inputWriter.close();

    //--Get Date
    //(error checks)
    if(!document.has(CoreAnnotations.CalendarAnnotation.class) && !document.has(CoreAnnotations.DocDateAnnotation.class)){
      throw new IllegalArgumentException("CoreMap must have either a Calendar or DocDate annotation"); //not strictly necessary, technically...
    }
    //(variables)
    Calendar dateCalendar = document.get(CoreAnnotations.CalendarAnnotation.class);
    String pubDate = null;
    if (dateCalendar != null) {
      //(case: calendar annotation)
      pubDate = String.format("%TF", dateCalendar);
    } else {
      //(case: docdateannotation)
      String s = document.get(CoreAnnotations.DocDateAnnotation.class);
      if (s != null) {
        pubDate = s;
      }
    }

    //--Build Command
    ArrayList<String> args = new ArrayList<String>();
    args.add("java");
    args.add("-jar"); args.add(this.heideltimePath.getPath() + "/heideltime.jar");
    args.add("-t"); args.add("NEWS");
    if(pubDate != null){
      args.add("-dct"); args.add(pubDate);
    }
    args.add(inputFile.getPath());
    // run HeidelTime on the input file
    ProcessBuilder process = new ProcessBuilder(args);

    StringWriter outputWriter = new StringWriter();
    SystemUtils.run(process, outputWriter, null);
    String output = outputWriter.getBuffer().toString();
    Pattern docClose = Pattern.compile("</DOC>.*", Pattern.DOTALL);
    output = docClose.matcher(output).replaceAll("</DOC>").replaceAll("<!DOCTYPE TimeML SYSTEM \"TimeML.dtd\">",""); //TODO TimeML.dtd? FileNotFoundException if we leave it in
    Pattern badNestedTimex = Pattern.compile(Pattern.quote("<T</TIMEX3>IMEX3"));
    output = badNestedTimex.matcher(output).replaceAll("</TIMEX3><TIMEX3");
    Pattern badNestedTimex2 = Pattern.compile(Pattern.quote("<TI</TIMEX3>MEX3"));
    output = badNestedTimex2.matcher(output).replaceAll("</TIMEX3><TIMEX3");
    output = output.replaceAll("\\n\\n<TimeML>\\n\\n","<TimeML>");

    // parse the HeidelTime output
    Element outputXML;
    try {
      outputXML = XMLUtils.parseElement(output);
    } catch (Exception ex) {
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s\noutput:\n%s",
          ex, IOUtils.slurpFile(inputFile), output), ex);
    }
    inputFile.delete();
   
    // get Timex annotations
    List<CoreMap> timexAnns = toTimexCoreMaps(outputXML, document);
    document.set(TimeAnnotations.TimexAnnotations.class, timexAnns);
    if (outputResults) {
      System.out.println(timexAnns);
    }
   
    // align Timex annotations to sentences
    int timexIndex = 0;
    for (CoreMap sentence: document.get(CoreAnnotations.SentencesAnnotation.class)) {
      int sentBegin = beginOffset(sentence);
      int sentEnd = endOffset(sentence);
     
      // skip times before the sentence
      while (timexIndex < timexAnns.size() && beginOffset(timexAnns.get(timexIndex)) < sentBegin) {
        ++timexIndex;
      }
     
      // determine times within the sentence
      int sublistBegin = timexIndex;
      int sublistEnd = timexIndex;
      while (timexIndex < timexAnns.size() &&
             sentBegin <= beginOffset(timexAnns.get(timexIndex)) &&
             endOffset(timexAnns.get(timexIndex)) <= sentEnd) {
        ++sublistEnd;
        ++timexIndex;
      }
     
      // set the sentence timexes
      sentence.set(TimeAnnotations.TimexAnnotations.class, timexAnns.subList(sublistBegin, sublistEnd));
    }
  }
 
  private static int beginOffset(CoreMap ann) {
    return ann.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
  }
 
  private static int endOffset(CoreMap ann) {
    return ann.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
  }
 
  private static List<CoreMap> toTimexCoreMaps(Element docElem, CoreMap originalDocument) {
    //--Collect Token Offsets
    Map<Integer,Integer> beginMap = Generics.newHashMap();
    Map<Integer,Integer> endMap = Generics.newHashMap();
    boolean haveTokenOffsets = true;
    for(CoreMap sent : originalDocument.get(CoreAnnotations.SentencesAnnotation.class)){
      for(CoreLabel token : sent.get(CoreAnnotations.TokensAnnotation.class)){
        Integer tokBegin = token.get(CoreAnnotations.TokenBeginAnnotation.class);
        Integer tokEnd = token.get(CoreAnnotations.TokenEndAnnotation.class);
        if(tokBegin == null || tokEnd == null){ haveTokenOffsets = false; }
        int charBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int charEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        beginMap.put(charBegin,tokBegin);
        endMap.put(charEnd,tokEnd);
      }
    }
    List<CoreMap> timexMaps = new ArrayList<CoreMap>();
    int offset = 0;
    NodeList docNodes = docElem.getChildNodes();
    for (int i = 0; i < docNodes.getLength(); i++) {
      Node content = docNodes.item(i);
      if (content instanceof Text) {
        Text text = (Text)content;
        offset += text.getWholeText().length();
      } else if (content instanceof Element) {
        Element child = (Element)content;
        if (child.getNodeName().equals("TIMEX3")) {
          Timex timex = new Timex(child);
          if (child.getChildNodes().getLength() != 1) {
            throw new RuntimeException("TIMEX3 should only contain text " + child);
          }
          String timexText = child.getTextContent();
          CoreMap timexMap = new ArrayCoreMap();
          timexMap.set(TimeAnnotations.TimexAnnotation.class, timex);
          timexMap.set(CoreAnnotations.TextAnnotation.class, timexText);
          int charBegin = offset;
          timexMap.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
          offset += timexText.length();
          timexMap.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
          int charEnd = offset;
          //(tokens)
          if(haveTokenOffsets){
            Integer tokBegin = beginMap.get(charBegin);
            int searchStep = 1;          //if no exact match, search around the character offset
            while(tokBegin == null){
              tokBegin = beginMap.get(charBegin - searchStep);
              if(tokBegin == null){
                tokBegin = beginMap.get(charBegin + searchStep);
              }
              searchStep += 1;
            }
            searchStep = 1;
            Integer tokEnd = endMap.get(charEnd);
            while(tokEnd == null){
              tokEnd = endMap.get(charEnd - searchStep);
              if(tokEnd == null){
                tokEnd = endMap.get(charEnd + searchStep);
              }
              searchStep += 1;
            }
            timexMap.set(CoreAnnotations.TokenBeginAnnotation.class, tokBegin);
            timexMap.set(CoreAnnotations.TokenEndAnnotation.class, tokEnd);
          }
          timexMaps.add(timexMap);
        } else {
          throw new RuntimeException("unexpected element " + child);
        }
      } else {
        throw new RuntimeException("unexpected content " + content);
      }
    }
    return timexMaps;
  }

  @Override
  public Set<Requirement> requires() {
    return TOKENIZE_AND_SSPLIT;
  }

  @Override
  public Set<Requirement> requirementsSatisfied() {
    return Collections.singleton(HEIDELTIME_REQUIREMENT);
  }
}
TOP

Related Classes of edu.stanford.nlp.time.HeidelTimeAnnotator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.