// Source code of org.apache.ctakes.core.sentence.SentenceDetectorCtakes

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.core.sentence;




import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


import opennlp.maxent.GIS;
import opennlp.maxent.GISModel;
import opennlp.model.EventStream;
import opennlp.model.MaxentModel;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.sentdetect.EndOfSentenceScanner;
import opennlp.tools.sentdetect.SDContextGenerator;
import opennlp.tools.sentdetect.SDEventStream;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.SentenceSampleStream;
import opennlp.tools.sentdetect.lang.Factory;
import opennlp.tools.util.HashSumEventStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringUtil;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;


/**
 * A sentence detector for splitting up raw text into sentences.
 * <p>
 * A maximum entropy model is used to evaluate the characters ".", "!", and "?" in a
 * string to determine if they signify the end of a sentence.
 * 
 * @see  opennlp.tools.sentdetect.SentenceDetectorME in OpenNLP 1.5
 */
public class SentenceDetectorCtakes {


    /**
     * Constant indicates a sentence split.
     */
    public static final String SPLIT ="s";


    /**
     * Constant indicates no sentence split.
     */
    public static final String NO_SPLIT ="n";
    
    private static final Double ONE = new Double(1);


    /**
     * The maximum entropy model to use to evaluate contexts.
     */
    private MaxentModel model;


    /**
     * The feature context generator.
     */
    private final SDContextGenerator cgen;


    /**
     * The {@link EndOfSentenceScanner} to use when scanning for end of sentence offsets.
     */
    private final EndOfSentenceScanner scanner;


    /**
     * The list of probabilities associated with each decision.
     */
    private List<Double> sentProbs = new ArrayList<Double>();


    protected boolean useTokenEnd;


    /**
     * Initializes the current instance.
     *
     * @param model the {@link SentenceModel}
     */
    public SentenceDetectorCtakes(MaxentModel model, SDContextGenerator cg, EndOfSentenceScanner eoss) {
      this.model = model;
      cgen = cg;
      scanner = eoss;
      useTokenEnd = false; // TODO
    }




    /**
     * Detect sentences in a String.
     *
     * @param s  The string to be processed.
     *
     * @return   A string array containing individual sentences as elements.
     */
    public String[] sentDetect(String s) {
      int[] endsOfSentences = sentPosDetect(s);
      String sentences[];
      if (endsOfSentences.length != 0) {


        sentences = new String[endsOfSentences.length];


        int begin = 0;
        for (int si = 0; si < endsOfSentences.length; si++) {
          sentences[si] = s.substring(begin, endsOfSentences[si]+1);
          begin = endsOfSentences[si]+1;
        }
      }
      else {
        sentences = new String[] {};
      }
      return sentences;
    }


    private int getFirstWS(String s, int pos) {
      while (pos < s.length() && !StringUtil.isWhitespace(s.charAt(pos)))
        pos++;
      return pos;
    }


    private int getFirstNonWS(String s, int pos) {
      while (pos < s.length() && StringUtil.isWhitespace(s.charAt(pos)))
        pos++;
      return pos;
    }


    /**
     * Detect the position of the first words of sentences in a String.
     *
     * @param s  The string to be processed.
     * @return   A integer array containing the positions of the end index of
     *          every sentence
     *
     * @see SentenceDetectorME#sentPosDetect(String)  
     */
    public int[] sentPosDetect(String s) { // return int[] to be line OpenNLP 1.4
      double sentProb = 1;
      sentProbs.clear();
      StringBuffer sb = new StringBuffer(s);
      List<Integer> enders = scanner.getPositions(s);
      List<Integer> positions = new ArrayList<Integer>(enders.size());


      for (int i = 0, end = enders.size(), index = 0; i < end; i++) {
        Integer candidate = enders.get(i);
        int cint = candidate;
        // skip over the leading parts of non-token final delimiters
        int fws = getFirstWS(s,cint + 1);
        if (i + 1 < end && enders.get(i + 1) < fws) {
          continue;
        }


        double[] probs = model.eval(cgen.getContext(sb, cint));
        String bestOutcome = model.getBestOutcome(probs);
        sentProb *= probs[model.getIndex(bestOutcome)];


        if (bestOutcome.equals(SPLIT) && isAcceptableBreak(s, index, cint)) {
          if (index != cint) {
            if (useTokenEnd) {
              positions.add(getFirstNonWS(s, getFirstWS(s,cint + 1)));
            }
            else {
              positions.add(getFirstNonWS(s,cint));
            }
            sentProbs.add(new Double(probs[model.getIndex(bestOutcome)]));
          }
          index = cint + 1;
        }
      }


      int[] sentenceBreaks = new int[positions.size()];
      for (int i = 0; i < sentenceBreaks.length; i++) {
        sentenceBreaks[i] = positions.get(i)+1;
      }


      return sentenceBreaks;
      
    }


    /**
     * Returns the probabilities associated with the most recent
     * calls to sentDetect().
     *
     * @return probability for each sentence returned for the most recent
     * call to sentDetect.  If not applicable an empty array is
     * returned.
     */
    public double[] getSentenceProbabilities() {
      double[] sentProbArray = new double[sentProbs.size()];
      for (int i = 0; i < sentProbArray.length; i++) {
        sentProbArray[i] = ((Double) sentProbs.get(i)).doubleValue();
      }
      return sentProbArray;
    }


    /**
     * Allows subclasses to check an overzealous (read: poorly
     * trained) model from flagging obvious non-breaks as breaks based
     * on some boolean determination of a break's acceptability.
     *
     * <p>The implementation here always returns true, which means
     * that the MaxentModel's outcome is taken as is.</p>
     *
     * @param s the string in which the break occurred.
     * @param fromIndex the start of the segment currently being evaluated
     * @param candidateIndex the index of the candidate sentence ending
     * @return true if the break is acceptable
     */
    protected boolean isAcceptableBreak(String s, int fromIndex, int candidateIndex) {
      return true;
    }
    
    public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
        boolean useTokenEnd, Dictionary abbreviations) throws IOException {
      return train(languageCode, samples, useTokenEnd, abbreviations,5,100);
    }
    
    public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
        boolean useTokenEnd, Dictionary abbreviations, int cutoff, int iterations) throws IOException {


      Map<String, String> manifestInfoEntries = new HashMap<String, String>();
      ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
      
      Factory factory = new Factory();


      // TODO: Fix the EventStream to throw exceptions when training goes wrong
      EventStream eventStream = new SDEventStream(samples,
          factory.createSentenceContextGenerator(languageCode),
          factory.createEndOfSentenceScanner(languageCode));
      
      HashSumEventStream hses = new HashSumEventStream(eventStream);
      GISModel sentModel = GIS.trainModel(hses, iterations, cutoff);


      manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, 
          hses.calculateHashSum().toString(16));
      
      return new SentenceModel(languageCode, sentModel,
          useTokenEnd, abbreviations, manifestInfoEntries);
    }


    private static void usage() {
      System.err.println("Usage: SentenceDetectorME -encoding charset -lang language trainData modelName [cutoff iterations]");
      System.err.println("-encoding charset specifies the encoding which should be used ");
      System.err.println("                  for reading and writing text.");
      System.err.println("-lang language    specifies the language which ");
      System.err.println("                  is being processed.");
      System.err.println("trainData         specifies the name of the input training file");
      System.err.println("                  to train the resulting model.");
      System.err.println("modelName         specifies the resulting saved model after");
      System.err.println("                  training.");
      System.exit(1);
    }


    /**
     * <p>Trains a new sentence detection model.</p>
     *
     * <p>Usage: opennlp.tools.sentdetect.SentenceDetectorME data_file new_model_name (iterations cutoff)?</p>
     *
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
      int ai=0;
      String encoding = null;
      String lang = null;
      if (args.length == 0) {
        usage();
      }
      while (args[ai].startsWith("-")) {
        if (args[ai].equals("-encoding")) {
          ai++;
          if (ai < args.length) {
            encoding = args[ai];
            ai++;
          }
          else {
            usage();
          }
        }
        else if (args[ai].equals("-lang")) {
          ai++;
          if (ai < args.length) {
            lang = args[ai];
            ai++;
          }
          else {
            usage();
          }
        }
        else {
          usage();
        }
      }


      File inFile = new File(args[ai++]);
      File outFile = new File(args[ai++]);


      int numberOfArgs = args.length;
      int iters = (ai < numberOfArgs ? convertToInt(args[ai++]) : 100);
      int cutoff = (ai < numberOfArgs ? convertToInt(args[ai++]) : 4);




      try {
        if ((lang == null) || (encoding == null)) {
          usage();
        }


        
        SentenceModel model = train(lang, new SentenceSampleStream(new PlainTextByLineStream(
            new InputStreamReader(new FileInputStream(inFile), encoding))), true, null, cutoff, iters);


        // TODO: add support for iterations and cutoff settings


//        if (args.length > ai)
//          mod = train(es, Integer.parseInt(args[ai++]), Integer.parseInt(args[ai++]));
//        else
//          mod = train(es, 100, 5);


        System.out.println("Saving the model as: " + outFile);
        model.serialize(new FileOutputStream(outFile));
      }
      catch (Exception e) {
        e.printStackTrace();
      }
    }




  private static int convertToInt(String s) {


    int i = Integer.parseInt(s); 
    return i;
  }
  
}
// Source code of org.apache.ctakes.core.sentence.SentenceDetectorCtakes

// Related classes of org.apache.ctakes.core.sentence.SentenceDetectorCtakes