Package org.apache.ctakes.core.ci

Source Code of org.apache.ctakes.core.ci.HyphenTextModifierImpl

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Created on May 23, 2005
*
* To change the template for this generated file go to
* Window>Preferences>Java>Code Generation>Code and Comments
*/
package org.apache.ctakes.core.ci;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.ctakes.core.nlp.tokenizer.Token;
import org.apache.ctakes.core.nlp.tokenizer.Tokenizer;


/**
* @author Mayo Clinic
*
*/
public class HyphenTextModifierImpl implements TextModifier {

  private Map<String, Integer> iv_shouldbeHyphenMap = null;
  private int iv_windowSize = 3; // default lookahead window
  private Tokenizer iv_tokenizer = null;

  /*
   * DECPRECATED: Uses InputSteam instead
   */
  public HyphenTextModifierImpl(String hyphenfilename, int windowSize) {
    iv_windowSize = windowSize;
    iv_tokenizer = new Tokenizer();
    BufferedReader br;
    try {
      br = new BufferedReader(new FileReader(new File(hyphenfilename)));

      String line = "";

      iv_shouldbeHyphenMap = new HashMap<String, Integer>();
      while ((line = br.readLine()) != null) {
        String[] toks = line.split("\\|");
        String[] unh = toks[0].split("\\-");
        String shouldbehyphen = "";
        for (int i = 0; i < unh.length; i++) {
          shouldbehyphen += " " + unh[i];
        }
        shouldbehyphen = shouldbehyphen.trim().toLowerCase();
        iv_shouldbeHyphenMap.put(shouldbehyphen, new Integer(1));
      }
    } catch (FileNotFoundException e) {
      System.err.println("Cannot find the hyphenation file:" + hyphenfilename);
      e.printStackTrace();
    } catch (IOException e) {
      System.err.println("IOException accessing the hyphenation file:" + hyphenfilename);
      e.printStackTrace();
    }

  }
  /**
   * Default constructor takes a name of the file containing hyphenated
   * phrases, with their frequency.
   * Currently the frequency is unused.<br>
   * The case of the words in the file is unimportant - we lowercase
   * everything when doing compares.<br>
   * The file is delimited with "|" and has two fields:<br>
   * hyphen-term|frequency
   */
  public HyphenTextModifierImpl(InputStream hyphenfilename, int windowSize) {
    iv_windowSize = windowSize;
    iv_tokenizer = new Tokenizer();
    BufferedReader br;
    try {
      br = new BufferedReader(new InputStreamReader(hyphenfilename));
      String line = "";

      iv_shouldbeHyphenMap = new HashMap<String, Integer>();
      while ((line = br.readLine()) != null) {
        String[] toks = line.split("\\|");
        String[] unh = toks[0].split("\\-");
        String shouldbehyphen = "";
        for (int i = 0; i < unh.length; i++) {
          shouldbehyphen += " " + unh[i];
        }
        shouldbehyphen = shouldbehyphen.trim().toLowerCase();
        iv_shouldbeHyphenMap.put(shouldbehyphen, new Integer(1));
      }
    } catch (FileNotFoundException e) {
      System.err.println("Cannot find the hyphenation file:" + hyphenfilename);
      e.printStackTrace();
    } catch (IOException e) {
      System.err.println("IOException accessing the hyphenation file:" + hyphenfilename);
      e.printStackTrace();
    }

  } 

  /**
   * Filters out unwanted tokens - newlines.
   *
   * @param tokenList
   */
  private void filterTokens(List<Token> tokenList) {

    List<Token> removalList = new ArrayList<Token>();
    Iterator<Token> tokenItr = tokenList.iterator();

    while (tokenItr.hasNext()) {
      Token token = tokenItr.next();
      if (token.getType() == Token.TYPE_EOL) {
        removalList.add(token);
      }
    }

    tokenList.removeAll(removalList);
  }

  /*
   * (non-Javadoc)
   *
   * @see edu.mayo.bmi.uima.util.ci.TextModifier#modify(java.lang.String)
   */
  public TextModification[] modify(String in) throws Exception {

    // intermediate data structure to use for easy adding of new
    // TextModification objects
    ArrayList<TextModification> textmods = new ArrayList<TextModification>();

    // Tokenize the input to get offset information
    List<Token> inputtoks = iv_tokenizer.tokenizeAndSort(in);

    filterTokens(inputtoks);

    int orig_startOffset = 0;
    int orig_endOffset = 0;
    int new_startOffset = 0;
    int new_endOffset = 0;

    int i = 0;
    int j = 0;
    int end_offset_adj = 0;
    int start_offset_adj = 0;

    while (i < inputtoks.size()) {

      if (inputtoks.size() - (i + 1) < iv_windowSize) {
        j = inputtoks.size() - 1;
      } else {
        j = i + iv_windowSize;
      }

      while (j > i) {

        StringBuffer candSB = new StringBuffer();
        for (int k = i; k <= j; k++) {
          Token currtok = (Token) inputtoks.get(k);
          candSB.append(" ");
          candSB.append(currtok.getText());
        }
        String cand = candSB.toString().trim();

        // Attempt to look up the candidate in the hyphen map
        if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) {

          // set the initial offsets
          orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset();
          orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset();
          new_startOffset = orig_startOffset;
          new_endOffset = orig_endOffset;

          // compile new text
          String newText = "";
          for (int k = i; k <= j; k++) {
            Token currtok = (Token) inputtoks.get(k);
            newText += currtok.getText() + "-";
          }
          newText = newText.substring(0, newText.length() - 1);

          // Get the new and old lengths of hyphenated spans
          int new_Length = newText.length();
          int orig_Length = orig_endOffset - orig_startOffset;

          // Pad the end offset adjuster by the new amount
          end_offset_adj += orig_Length - new_Length;

          // Create a new modification object
          TextModification tm = new TextModification(orig_startOffset, orig_endOffset, new_startOffset
              - start_offset_adj, new_endOffset - end_offset_adj, newText);

          // Adjust the start offset on the next Text Modification
          // object
          start_offset_adj += orig_Length - new_Length;

          // Put the newly created TextMod object into a temporary
          // holder
          textmods.add(tm);

          i = j;
        }
        j--;
      }

      i++;
    }

    // generate the expected return as an array of TextModification objects
    TextModification[] tma = new TextModification[textmods.size()];
    for (int y = 0; y < tma.length; y++) {
      tma[y] = (TextModification) textmods.get(y);
    }

    return tma;
  }

 
    /**
     * Apply text modifier to the text <br>
     * TODO - move this to <code>TextModifier</code> and take a <code>Logger</code>
     *     See <code>HyphenTextModifierImpl</code>
   * @param tm TextModifier to apply
   * @param text Original text
   * @param sb Buffer containing text to apply modifier to
     * @return unableToModifyText true if modifier would require offset changes, which is not supported by this method
   * @throws Exception
     */
    private static boolean applyTextModifier(TextModifier tm, String text, StringBuffer sb) throws Exception {
      boolean unableToModifyText = false;
        TextModification[] textModArr = tm.modify(text);
        for (int i = 0; i < textModArr.length; i++) {

          TextModification textMod = textModArr[i];
           
            if ((textMod.getOrigStartOffset() != textMod.getNewStartOffset())
                    || (textMod.getOrigEndOffset() != textMod.getNewEndOffset())) {
                System.err.println("UNSUPPORTED: TextModification with offset changes.");
                unableToModifyText = true;
            }
            else {
              sb.replace(textMod.getOrigStartOffset(),
                textMod.getOrigEndOffset(),
                textMod.getNewText());
            }
        } 
        return unableToModifyText;
    }
 
    public static ArrayList<String> test(HyphenTextModifierImpl tm, String text) {
      ArrayList<String> messages = new ArrayList<String>();
      try {
      TextModification[] tma = tm.modify(text);
      StringBuffer sb = new StringBuffer(text);
      boolean errorModifyingText = applyTextModifier(tm,text,sb);
      messages.add("Orig: " + text);
      if (!errorModifyingText) {
        messages.add("New:  " + sb);
      }
      else {
        System.err.println("New:  (new text not generated, see previous messages)");       
      }
      // Regardless of whether was able to modify the text
      // without
      // (_apply_ the TextModifier), output the 
      // the
      for (int u = 0; u < tma.length; u++) {
        TextModification tmo = (TextModification) tma[u];
        messages.add(tmo.getNewText() + " Orig: " + tmo.getOrigStartOffset() + "-"
            + tmo.getOrigEndOffset() + " New: " + tmo.getNewStartOffset() + "-" + tmo.getNewEndOffset());
      }
    } catch (Exception e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    return messages;
     
    }
  /**
   * Simple tests of <code>TextModification</code>
   * <br>
   * Output expected:<br>
   *     UNSUPPORTED: TextModification with offset changes.<br>
   *     UNSUPPORTED: TextModification with offset changes.<br>
   *     UNSUPPORTED: TextModification with offset changes.<br>
   *      Orig: Non  Hodgkin's the x  ray without any non small  cell complications.<br>
   *      New:  (new text not generated, see previous messages)
   *     Non-Hodgkin Orig: 0-12 New: 0-11<br>
   *     x-ray Orig: 19-25 New: 18-23<br>
   *     non-small-cell Orig: 38-53 New: 36-50<br>
   *
   *     Orig: Non Hodgkin's the x ray without any non small cell complications.<br>
   *     New:  Non-Hodgkin's the x-ray without any non-small-cell complications.<br>
   *     Non-Hodgkin Orig: 0-11 New: 0-11<br>
   *     x-ray Orig: 18-23 New: 18-23<br>
   *     non-small-cell Orig: 36-50 New: 36-50<br>
   * Note the case of the words doesn't matter.
   * @param args hyphen text filename (each line: hyphenated-word|freq)
   */
  public static void main(String[] args) {
    ArrayList<String> messages;
    HyphenTextModifierImpl tm = new HyphenTextModifierImpl(args[0], 7);

    String t = "Non  Hodgkin's the x  ray without any non small  cell complications.";
    messages = test(tm, t); // extra blanks
    for (String s : messages) {  System.out.println(s); }

    t = t.replace("  ", " "); // change text to only have single blanks between words
    messages = test(tm, t); // single blanks
    for (String s : messages) {  System.out.println(s); }
  }

}
TOP

Related Classes of org.apache.ctakes.core.ci.HyphenTextModifierImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.