Package org.apache.ctakes.core.fsm.machine

Source Code of org.apache.ctakes.core.fsm.machine.DateFSM

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.core.fsm.machine;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.ctakes.core.fsm.condition.CombineCondition;
import org.apache.ctakes.core.fsm.condition.IntegerRangeCondition;
import org.apache.ctakes.core.fsm.condition.NegateCondition;
import org.apache.ctakes.core.fsm.condition.PunctuationValueCondition;
import org.apache.ctakes.core.fsm.condition.WordSetCondition;
import org.apache.ctakes.core.fsm.output.DateToken;
import org.apache.ctakes.core.fsm.state.NamedState;
import org.apache.ctakes.core.fsm.state.NonTerminalEndState;
import org.apache.ctakes.core.fsm.token.BaseToken;

import net.openai.util.fsm.AnyCondition;
import net.openai.util.fsm.Condition;
import net.openai.util.fsm.Machine;
import net.openai.util.fsm.State;

/**
* Uses one or more finite state machines to detect dates in the given input of
* tokens.
*
* @author Mayo Clinic
*/
public class DateFSM {
  // constants
  private final int MIN_MONTH = 1;
  private final int MAX_MONTH = 12;
  private final int MIN_DAY = 1;
  private final int MAX_DAY = 31;
  private final int MIN_YEAR = 1;
  private final int MAX_YEAR = 2999;

  // month names in FULL and SHORT formats
  private Set<String> iv_monthFullNameSet = new HashSet<String>();
  private Set<String> iv_monthShortNameSet = new HashSet<String>();

  // contains the finite state machines
  private Set<Machine> iv_machineSet = new HashSet<Machine>();

  /**
   *
   * Constructor
   *
   */
  public DateFSM() {
    iv_monthFullNameSet.add("january");
    iv_monthFullNameSet.add("february");
    iv_monthFullNameSet.add("march");
    iv_monthFullNameSet.add("april");
    iv_monthFullNameSet.add("may");
    iv_monthFullNameSet.add("june");
    iv_monthFullNameSet.add("july");
    iv_monthFullNameSet.add("august");
    iv_monthFullNameSet.add("september");
    iv_monthFullNameSet.add("october");
    iv_monthFullNameSet.add("november");
    iv_monthFullNameSet.add("december");

    iv_monthShortNameSet.add("jan");
    iv_monthShortNameSet.add("feb");
    iv_monthShortNameSet.add("mar");
    iv_monthShortNameSet.add("apr");
    iv_monthShortNameSet.add("may");
    iv_monthShortNameSet.add("jun");
    iv_monthShortNameSet.add("jul");
    iv_monthShortNameSet.add("aug");
    iv_monthShortNameSet.add("sep");
    iv_monthShortNameSet.add("sept");
    iv_monthShortNameSet.add("oct");
    iv_monthShortNameSet.add("nov");
    iv_monthShortNameSet.add("dec");

    iv_machineSet.add(getNumericDateMachine());
    iv_machineSet.add(getTextualDateMachine());
  }

  /**
   * Gets a finite state machine that detects the following:
   * <ol>
   * <li>10/15/2002</li>
   * <li>10/15</li>
   * <li>10-15-2002</li>
   * <li>10-15</li>
   * </ol>
   *
   * @return
   */
  private Machine getNumericDateMachine() {
    State startState = new NamedState("START");
    State endState = new NamedState("END");
    State ntEndState = new NonTerminalEndState("NON TERMINAL END");
    endState.setEndStateFlag(true);
    ntEndState.setEndStateFlag(true);

    Machine machine = new Machine(startState);
    State monthNumState = new NamedState("MONTH_NUM");
    State monthDaySeparatorState = new NamedState("MONTH_DAY_SEP");
    State dayNumState = new NamedState("DAY_NUM");
    State dayYearSeparatorState = new NamedState("DAY_YEAR_SEP");

    Condition monthNumCondition = new IntegerRangeCondition(MIN_MONTH,
        MAX_MONTH);
    Condition mdFslashCondition = new PunctuationValueCondition('/');
    Condition mdDashCondition = new PunctuationValueCondition('-');
    Condition dyFslashCondition = new PunctuationValueCondition('/');
    Condition dyDashCondition = new PunctuationValueCondition('-');
    Condition dayNumCondition = new IntegerRangeCondition(MIN_DAY, MAX_DAY);
    Condition yearNumCondition = new IntegerRangeCondition(MIN_YEAR,
        MAX_YEAR);

    // condition that is not a fslash or dash
    Condition notFslashNotDashCondition = new CombineCondition(
        new NegateCondition(mdFslashCondition), new NegateCondition(
            mdDashCondition));

    startState.addTransition(monthNumCondition, monthNumState);
    startState.addTransition(new AnyCondition(), startState);

    monthNumState.addTransition(mdFslashCondition, monthDaySeparatorState);
    monthNumState.addTransition(mdDashCondition, monthDaySeparatorState);
    monthNumState.addTransition(new AnyCondition(), startState);

    monthDaySeparatorState.addTransition(dayNumCondition, dayNumState);
    monthDaySeparatorState.addTransition(new AnyCondition(), startState);

    dayNumState.addTransition(dyFslashCondition, dayYearSeparatorState);
    dayNumState.addTransition(dyDashCondition, dayYearSeparatorState);
    dayNumState.addTransition(notFslashNotDashCondition, ntEndState);
    dayNumState.addTransition(new AnyCondition(), startState);

    dayYearSeparatorState.addTransition(yearNumCondition, endState);
    dayYearSeparatorState.addTransition(new AnyCondition(), startState);

    endState.addTransition(new AnyCondition(), startState);
    ntEndState.addTransition(new AnyCondition(), startState);

    return machine;
  }

  /**
   * Gets a finite state machine that detects the following:
   * <ol>
   * <li>October 15, 2002</li>
   * <li>October 15 2002</li>
   * <li>October 15</li>
   * <li>October 2002</li>
   * <li>Oct 15, 2002</li>
   * <li>Oct 15 2002</li>
   * <li>Oct 15</li>
   * <li>Oct 2002</li>
   * <li>Oct. 15, 2002</li>
   * <li>Oct. 15 2002</li>
   * <li>Oct. 15</li>
   * <li>Oct. 2002</li>
   * </ol>
   *
   * @return
   */
  private Machine getTextualDateMachine() {
    State startState = new NamedState("START");
    State endState = new NamedState("END");
    State ntEndState = new NonTerminalEndState("NON TERMINAL END");
    endState.setEndStateFlag(true);
    ntEndState.setEndStateFlag(true);

    Machine m = new Machine(startState);

    State monthFullTextState = new NamedState("MONTH_FULL_TEXT");
    State monthShortTextState = new NamedState("MONTH_SHORT_TEXT");
    State dayNumState = new NamedState("DAY_NUM");
    State commaState = new NamedState("COMMA");
    State periodState = new NamedState("PERIOD");

    Condition monthFullTextCondition = new WordSetCondition(
        iv_monthFullNameSet, false);
    Condition monthShortTextCondition = new WordSetCondition(
        iv_monthShortNameSet, false);
    Condition dayNumCondition = new IntegerRangeCondition(MIN_DAY, MAX_DAY);
    Condition periodCondition = new PunctuationValueCondition('.');
    Condition yearNumCondition = new IntegerRangeCondition(MIN_YEAR,
        MAX_YEAR);
    Condition commaCondition = new PunctuationValueCondition(',');

    // condition for not being a comma or year number
    Condition notCommaNotYearNumCondition = new CombineCondition(
        new NegateCondition(commaCondition), new NegateCondition(
            yearNumCondition));

    // condition for being a year num and not a day num
    Condition yearNotDayNumCondition = new CombineCondition(
        yearNumCondition, new NegateCondition(dayNumCondition));

    startState.addTransition(monthFullTextCondition, monthFullTextState);
    startState.addTransition(monthShortTextCondition, monthShortTextState);
    startState.addTransition(new AnyCondition(), startState);

    monthFullTextState.addTransition(dayNumCondition, dayNumState);
    monthFullTextState.addTransition(yearNotDayNumCondition, endState);
    monthFullTextState.addTransition(new AnyCondition(), startState);

    monthShortTextState.addTransition(dayNumCondition, dayNumState);
    monthShortTextState.addTransition(periodCondition, periodState);
    monthShortTextState.addTransition(yearNotDayNumCondition, endState);
    monthShortTextState.addTransition(new AnyCondition(), startState);

    periodState.addTransition(dayNumCondition, dayNumState);
    periodState.addTransition(yearNotDayNumCondition, endState);
    periodState.addTransition(new AnyCondition(), startState);

    dayNumState.addTransition(yearNumCondition, endState);
    dayNumState.addTransition(commaCondition, commaState);
    dayNumState.addTransition(notCommaNotYearNumCondition, ntEndState);
    dayNumState.addTransition(new AnyCondition(), startState);

    commaState.addTransition(yearNumCondition, endState);
    commaState.addTransition(new AnyCondition(), startState);

    endState.addTransition(new AnyCondition(), startState);
    ntEndState.addTransition(new AnyCondition(), startState);

    return m;
  }

  /**
   * Executes the finite state machines.
   *
   * @return Set of DateToken objects.
   */
  public Set<DateToken> execute(List<? extends BaseToken> tokens) throws Exception {
    Set<DateToken> dateSet = new HashSet<DateToken>();

    // maps a fsm to a token start index
    // key = fsm , value = token start index
    Map<Machine, Integer> tokenStartMap = new HashMap<Machine, Integer>();

    for (int i = 0; i < tokens.size(); i++) {
      BaseToken token = tokens.get(i);

      Iterator<Machine> machineItr = iv_machineSet.iterator();
      while (machineItr.hasNext()) {
        Machine fsm = machineItr.next();

        fsm.input(token);

        State currentState = fsm.getCurrentState();
        if (currentState.getStartStateFlag()) {
          tokenStartMap.put(fsm, new Integer(i));
        }
        if (currentState.getEndStateFlag()) {
          Object o = tokenStartMap.get(fsm);
          int tokenStartIndex;
          if (o == null) {
            // By default, all machines start with
            // token zero.
            tokenStartIndex = 0;
          } else {
            tokenStartIndex = ((Integer) o).intValue();
            // skip ahead over single token we don't want
            tokenStartIndex++;
          }
          BaseToken endToken = null;
          if (currentState instanceof NonTerminalEndState) {
            endToken = tokens.get(i - 1);
          } else {
            endToken = token;
          }

          BaseToken startToken = tokens
              .get(tokenStartIndex);
          DateToken dateToken = new DateToken(startToken
              .getStartOffset(), endToken.getEndOffset());
          dateSet.add(dateToken);
          fsm.reset();
        }
      }
    }

    // cleanup
    tokenStartMap.clear();

    // reset machines
    Iterator<Machine> itr = iv_machineSet.iterator();
    while (itr.hasNext()) {
      Machine fsm = itr.next();
      fsm.reset();
    }

    return dateSet;
  }

}
TOP

Related Classes of org.apache.ctakes.core.fsm.machine.DateFSM

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.