Source Code of org.apache.ctakes.core.ci.HyphenTextModifierImpl

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Created on May 23, 2005
 *
 * To change the template for this generated file go to
 * Window>Preferences>Java>Code Generation>Code and Comments
 */
package org.apache.ctakes.core.ci;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;


import org.apache.ctakes.core.nlp.tokenizer.Token;
import org.apache.ctakes.core.nlp.tokenizer.Tokenizer;




/**
 * @author Mayo Clinic
 * 
 */
public class HyphenTextModifierImpl implements TextModifier {


  private Map<String, Integer> iv_shouldbeHyphenMap = null;
  private int iv_windowSize = 3; // default lookahead window
  private Tokenizer iv_tokenizer = null;


  /*
   * DECPRECATED: Uses InputSteam instead
   */
  public HyphenTextModifierImpl(String hyphenfilename, int windowSize) {
    iv_windowSize = windowSize;
    iv_tokenizer = new Tokenizer();
    BufferedReader br;
    try {
      br = new BufferedReader(new FileReader(new File(hyphenfilename)));


      String line = "";


      iv_shouldbeHyphenMap = new HashMap<String, Integer>();
      while ((line = br.readLine()) != null) {
        String[] toks = line.split("\\|");
        String[] unh = toks[0].split("\\-");
        String shouldbehyphen = "";
        for (int i = 0; i < unh.length; i++) {
          shouldbehyphen += " " + unh[i];
        }
        shouldbehyphen = shouldbehyphen.trim().toLowerCase();
        iv_shouldbeHyphenMap.put(shouldbehyphen, new Integer(1));
      }
    } catch (FileNotFoundException e) {
      System.err.println("Cannot find the hyphenation file:" + hyphenfilename);
      e.printStackTrace();
    } catch (IOException e) {
      System.err.println("IOException accessing the hyphenation file:" + hyphenfilename);
      e.printStackTrace();
    }


  }
  /**
   * Default constructor takes a name of the file containing hyphenated
   * phrases, with their frequency.
   * Currently the frequency is unused.<br>
   * The case of the words in the file is unimportant - we lowercase
   * everything when doing compares.<br>
   * The file is delimited with "|" and has two fields:<br>
   * hyphen-term|frequency
   */
  public HyphenTextModifierImpl(InputStream hyphenfilename, int windowSize) {
    iv_windowSize = windowSize;
    iv_tokenizer = new Tokenizer();
    BufferedReader br;
    try {
      br = new BufferedReader(new InputStreamReader(hyphenfilename));
      String line = "";


      iv_shouldbeHyphenMap = new HashMap<String, Integer>();
      while ((line = br.readLine()) != null) {
        String[] toks = line.split("\\|");
        String[] unh = toks[0].split("\\-");
        String shouldbehyphen = "";
        for (int i = 0; i < unh.length; i++) {
          shouldbehyphen += " " + unh[i];
        }
        shouldbehyphen = shouldbehyphen.trim().toLowerCase();
        iv_shouldbeHyphenMap.put(shouldbehyphen, new Integer(1));
      }
    } catch (FileNotFoundException e) {
      System.err.println("Cannot find the hyphenation file:" + hyphenfilename);
      e.printStackTrace();
    } catch (IOException e) {
      System.err.println("IOException accessing the hyphenation file:" + hyphenfilename);
      e.printStackTrace();
    }


  }  


  /**
   * Filters out unwanted tokens - newlines.
   * 
   * @param tokenList
   */
  private void filterTokens(List<Token> tokenList) {


    List<Token> removalList = new ArrayList<Token>();
    Iterator<Token> tokenItr = tokenList.iterator();


    while (tokenItr.hasNext()) {
      Token token = tokenItr.next();
      if (token.getType() == Token.TYPE_EOL) {
        removalList.add(token);
      }
    }


    tokenList.removeAll(removalList);
  }


  /*
   * (non-Javadoc)
   * 
   * @see edu.mayo.bmi.uima.util.ci.TextModifier#modify(java.lang.String)
   */
  public TextModification[] modify(String in) throws Exception {


    // intermediate data structure to use for easy adding of new
    // TextModification objects
    ArrayList<TextModification> textmods = new ArrayList<TextModification>();


    // Tokenize the input to get offset information
    List<Token> inputtoks = iv_tokenizer.tokenizeAndSort(in);


    filterTokens(inputtoks);


    int orig_startOffset = 0;
    int orig_endOffset = 0;
    int new_startOffset = 0;
    int new_endOffset = 0;


    int i = 0;
    int j = 0;
    int end_offset_adj = 0;
    int start_offset_adj = 0;


    while (i < inputtoks.size()) {


      if (inputtoks.size() - (i + 1) < iv_windowSize) {
        j = inputtoks.size() - 1;
      } else {
        j = i + iv_windowSize;
      }


      while (j > i) {


        StringBuffer candSB = new StringBuffer();
        for (int k = i; k <= j; k++) {
          Token currtok = (Token) inputtoks.get(k);
          candSB.append(" ");
          candSB.append(currtok.getText());
        }
        String cand = candSB.toString().trim();


        // Attempt to look up the candidate in the hyphen map
        if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) {


          // set the initial offsets
          orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset();
          orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset();
          new_startOffset = orig_startOffset;
          new_endOffset = orig_endOffset;


          // compile new text
          String newText = "";
          for (int k = i; k <= j; k++) {
            Token currtok = (Token) inputtoks.get(k);
            newText += currtok.getText() + "-";
          }
          newText = newText.substring(0, newText.length() - 1);


          // Get the new and old lengths of hyphenated spans
          int new_Length = newText.length();
          int orig_Length = orig_endOffset - orig_startOffset;


          // Pad the end offset adjuster by the new amount
          end_offset_adj += orig_Length - new_Length;


          // Create a new modification object
          TextModification tm = new TextModification(orig_startOffset, orig_endOffset, new_startOffset
              - start_offset_adj, new_endOffset - end_offset_adj, newText);


          // Adjust the start offset on the next Text Modification
          // object
          start_offset_adj += orig_Length - new_Length;


          // Put the newly created TextMod object into a temporary
          // holder
          textmods.add(tm);


          i = j;
        }
        j--;
      }


      i++;
    }


    // generate the expected return as an array of TextModification objects
    TextModification[] tma = new TextModification[textmods.size()];
    for (int y = 0; y < tma.length; y++) {
      tma[y] = (TextModification) textmods.get(y);
    }


    return tma;
  }


  
    /**
     * Apply text modifier to the text <br>
     * TODO - move this to <code>TextModifier</code> and take a <code>Logger</code>
     *     See <code>HyphenTextModifierImpl</code>
   * @param tm TextModifier to apply
   * @param text Original text
   * @param sb Buffer containing text to apply modifier to
     * @return unableToModifyText true if modifier would require offset changes, which is not supported by this method 
   * @throws Exception
     */
    private static boolean applyTextModifier(TextModifier tm, String text, StringBuffer sb) throws Exception {
      boolean unableToModifyText = false;
        TextModification[] textModArr = tm.modify(text);
        for (int i = 0; i < textModArr.length; i++) {


          TextModification textMod = textModArr[i];
            
            if ((textMod.getOrigStartOffset() != textMod.getNewStartOffset())
                    || (textMod.getOrigEndOffset() != textMod.getNewEndOffset())) {
                System.err.println("UNSUPPORTED: TextModification with offset changes.");
                unableToModifyText = true;
            }
            else {
              sb.replace(textMod.getOrigStartOffset(), 
                textMod.getOrigEndOffset(), 
                textMod.getNewText());
            }
        }  
        return unableToModifyText;
    }
  
    public static ArrayList<String> test(HyphenTextModifierImpl tm, String text) {
      ArrayList<String> messages = new ArrayList<String>();
      try {
      TextModification[] tma = tm.modify(text);
      StringBuffer sb = new StringBuffer(text);
      boolean errorModifyingText = applyTextModifier(tm,text,sb);
      messages.add("Orig: " + text);
      if (!errorModifyingText) {
        messages.add("New:  " + sb);
      }
      else {
        System.err.println("New:  (new text not generated, see previous messages)");        
      }
      // Regardless of whether was able to modify the text
      // without
      // (_apply_ the TextModifier), output the  
      // the 
      for (int u = 0; u < tma.length; u++) {
        TextModification tmo = (TextModification) tma[u];
        messages.add(tmo.getNewText() + " Orig: " + tmo.getOrigStartOffset() + "-"
            + tmo.getOrigEndOffset() + " New: " + tmo.getNewStartOffset() + "-" + tmo.getNewEndOffset());
      }
    } catch (Exception e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    return messages;
      
    }
  /**
   * Simple tests of <code>TextModification</code>
   * <br>
   * Output expected:<br>
   *     UNSUPPORTED: TextModification with offset changes.<br>
   *     UNSUPPORTED: TextModification with offset changes.<br>
   *     UNSUPPORTED: TextModification with offset changes.<br>
   *      Orig: Non  Hodgkin's the x  ray without any non small  cell complications.<br>
   *      New:  (new text not generated, see previous messages)
   *     Non-Hodgkin Orig: 0-12 New: 0-11<br>
   *     x-ray Orig: 19-25 New: 18-23<br>
   *     non-small-cell Orig: 38-53 New: 36-50<br>
   * 
   *     Orig: Non Hodgkin's the x ray without any non small cell complications.<br>
   *     New:  Non-Hodgkin's the x-ray without any non-small-cell complications.<br>
   *     Non-Hodgkin Orig: 0-11 New: 0-11<br>
   *     x-ray Orig: 18-23 New: 18-23<br>
   *     non-small-cell Orig: 36-50 New: 36-50<br>
   * Note the case of the words doesn't matter. 
   * @param args hyphen text filename (each line: hyphenated-word|freq)
   */
  public static void main(String[] args) {
    ArrayList<String> messages;
    HyphenTextModifierImpl tm = new HyphenTextModifierImpl(args[0], 7);


    String t = "Non  Hodgkin's the x  ray without any non small  cell complications.";
    messages = test(tm, t); // extra blanks
    for (String s : messages) {  System.out.println(s); }


    t = t.replace("  ", " "); // change text to only have single blanks between words
    messages = test(tm, t); // single blanks
    for (String s : messages) {  System.out.println(s); }
  }


}
Source Code of org.apache.ctakes.core.ci.HyphenTextModifierImpl

Related Classes of org.apache.ctakes.core.ci.HyphenTextModifierImpl