Source Code of org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.core.nlp.tokenizer;


import static java.lang.Character.isDigit;
import static java.lang.Character.isLetter;
import static java.lang.Character.isLetterOrDigit;
import static java.lang.Character.isUpperCase;
import static java.lang.Character.isWhitespace;
import static org.apache.ctakes.core.nlp.tokenizer.TokenizerHelper.APOSTROPHE;
import static org.apache.ctakes.core.nlp.tokenizer.TokenizerHelper.COMMA;
import static org.apache.ctakes.core.nlp.tokenizer.TokenizerHelper.CR;
import static org.apache.ctakes.core.nlp.tokenizer.TokenizerHelper.HYPHEN_OR_MINUS_SIGN;
import static org.apache.ctakes.core.nlp.tokenizer.TokenizerHelper.NEWLINE;
import static org.apache.ctakes.core.nlp.tokenizer.TokenizerHelper.PERIOD;
import static org.apache.ctakes.core.nlp.tokenizer.TokenizerHelper.isPunctuation;


import java.util.ArrayList;
import java.util.List;


import org.apache.uima.jcas.JCas;


import org.apache.ctakes.core.ae.TokenizerAnnotator;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.ContractionToken;
import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
import org.apache.ctakes.typesystem.type.syntax.NumToken;
import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
import org.apache.ctakes.typesystem.type.syntax.SymbolToken;
import org.apache.ctakes.typesystem.type.syntax.WordToken;


/**
 * A class used to break natural text into tokens following PTB rules.
 * See Supplementary Guidelines for ETTB 2.0
 * dated April 6th, 2009. 
 * The token markup is external to the text and is not embedded.
 * Character offset location is used to identify the boundaries of a token.
 * 
 * @author Mayo Clinic
 */
public class TokenizerPTB {
    
      /**
   * Creates a tokenizer. The constructor body is empty; no fields are
   * initialized here.
   */
  public TokenizerPTB() {
  }


  
  // Shared zero-length String array.
  static final String [] emptyStringList = new String[0];
  // Shared empty list that tokenizeTextSegment returns when its input is null, empty,
  // or contains no token start. It is a plain (mutable) ArrayList shared by every call,
  // so callers must not add to or otherwise modify the list they get back.
  static final ArrayList<BaseToken> emptyTokenList = new ArrayList<BaseToken>();
      




  /**
   * Tokenize text that starts at offset offsetAdjustment within the complete text
   * @param textSegment the text to tokenize
   * @param offsetAdjustment what to add to all offsets within textSegment to make them be offsets from the start of the text for the jcas
   * @param includeTextNotJustOffsets whether to copy the text covered by this token into the token object itself
   * @return the list of new tokens
   */
  public List<?> tokenizeTextSegment(JCas jcas, String textSegment, int offsetAdjustment, boolean includeTextNotJustOffsets) {
    
      String lowerCasedText = textSegment.toLowerCase();
    ArrayList<Object> tokens = new ArrayList<Object>();
    Class<? extends BaseToken> tokenClass = null; 
    
    // if input was null or empty, return empty token list
    if (textSegment==null || textSegment.length()==0) return emptyTokenList;
    
    // find first character of a token
    int currentPosition = 0;
    currentPosition = findFirstCharOfNextToken(textSegment, currentPosition);
    // if input contained only white space but not even any newlines, return empty token list 
    if (currentPosition < 0) return emptyTokenList;
    
    while ((currentPosition = findFirstCharOfNextToken(textSegment, currentPosition))>=0) {
    
      // get current character and the one after that, which is used in making a number
      // of decisions. if at the end of the input, use '\0' to represent the non-existent 
      // character after the current one just to avoid dealing with null
      char firstCharOfToken = textSegment.charAt(currentPosition);
      int NOT_SET_INDICATOR = -999;
      int tokenLen = NOT_SET_INDICATOR; // should set it below to a real value
      Object bta;
      




      if (currentPosition+1 >= textSegment.length()) {
                  // we found the start of a token, but it was the last character in the input,
                  // so it is a 1-character token
                  tokenLen = 1;
                  tokenClass = null; // null indicates that we don't know yet what the class is


      } 
      
      // else we have at least 2 characters to consider
      
      else if (isWhitespace(textSegment.charAt(currentPosition+1))) {
          // Since the following character is whitespace, and the current character
          // is the first character of a token, the current character is a one-character token
          tokenLen = 1; 
          tokenClass = null; // null indicates that we don't know yet what the class is


      }
      
      else if (firstCharOfToken == NEWLINE) {
        tokenLen = 1; 
        tokenClass = NewlineToken.class;
      }
      
      else if (firstCharOfToken == CR) {
      
          char peekAhead;
          peekAhead = textSegment.charAt(currentPosition+1);
          if (peekAhead != NEWLINE) { 
        tokenLen = 1; 
        tokenClass = NewlineToken.class;
          }
          else { 
        // create CR followed by LF as single end-of-line marker
        tokenLen = 2; // skip an extra one to skip both the CR and the LF
        tokenClass = NewlineToken.class;
          }
      
      
      }
      
      else if (firstCharOfToken==PERIOD) {
          // check if decimal number without the leading digits
          int len = getLengthIfIsNumberThatStartsWithPeriod(currentPosition, textSegment);
          if (len > 0) {
        tokenClass = NumToken.class;
        tokenLen = len; 
          } 


          else if (isEllipsis(currentPosition, textSegment)) {
        tokenLen = 3; 
        tokenClass = PunctuationToken.class;
          } else {
        // Abbreviation does not start with period, and not part of some other token, so it is punctuation
        tokenLen = 1; 
        tokenClass = PunctuationToken.class;
          }


      } 


      else if (firstCharOfToken==HYPHEN_OR_MINUS_SIGN) {
          // If it's the first character of a token, then this is not a hyphenated term that
          // was supposed to be kept as one token, or we would have included it in the previous token
          // Also telephone numbers do not start with a dash
          // So assume the hyphen/dash char is a one-character token like in 5-6 or in -400
          tokenLen = 1; 
          tokenClass = PunctuationToken.class;
      } 
      
      else if (firstCharOfToken==APOSTROPHE) { 
          // "can't" is not part of this case because the n is the start of the second token
          // The 've part of should've is not handled here, when something like should've or he'll
          // is found, 2 tokens are created (elsewhere)
          
          // Check if start of a Name
          int len = getLengthIfNameStartingWithApostrophe(currentPosition, textSegment);
          if (len > 0) {
        tokenLen = len;
        tokenClass = WordToken.class;
          } else if (ContractionsPTB.isContractionThatStartsWithApostrophe(currentPosition, lowerCasedText)) { 
        // 'tis and 'twas which get tokenized as  "'t is"  and  "'t was"
        tokenLen = 2;
        tokenClass = ContractionToken.class;
        // the "is" or "was" part will become a token on the next iteration
        // TODO potential place to add some self-checking code
          } else { // is separate punctuation mark
        tokenLen = 1;   
        tokenClass = PunctuationToken.class;
          }
      } 
      
      else if (isPunctuation(firstCharOfToken)) { // other than any handled above
          // Already handled minus sign and leading period (which could be part of a decimal)
          // Since not processing 'web-text', no need to look for things like :)
          // so is some type of 1-character punctuation token
          tokenLen = 1; 
          tokenClass = PunctuationToken.class;
      } 
      
      else if (isLetterOrDigit(firstCharOfToken)) {


          boolean obviouslyIsWord = true; // until we find a non alphanum before a whitespace
          boolean obviouslyIsNumber = true; // until we find a non digit before a whitespace
          int nextWhitespaceOrEndOfSegment = -1;
          int nextNonLetterOrNonDigit = -1;
          int nextNonLetterDigitApostrophe = -1;
          int nextNonTelephoneOrPostalChar = -1; // digits and dash aka hyphen
          int nextNonNumericChar = -1; // 9,876.012345  is an example with all the numeric chars 
          int nextNonDigit = -1;
          
          // First check the easy case - if just letters and digits until next whitespace (or until end of segment)
          // then that is a word or a number, can skip all the other logic to check for +hyphens
          // or contractions etc
          int i = currentPosition;
          char ch;
          do {
        ch = textSegment.charAt(i);
        if (isWhitespace(ch)) {
            if (nextNonLetterOrNonDigit < 0) nextNonLetterOrNonDigit = i;
            if (nextNonLetterDigitApostrophe < 0) nextNonLetterDigitApostrophe = i;
            if (nextNonDigit < 0) nextNonDigit = i;
            if (nextNonTelephoneOrPostalChar < 0) nextNonTelephoneOrPostalChar = i;
            if (nextNonNumericChar < 0) nextNonNumericChar = i;
            nextWhitespaceOrEndOfSegment = i;
        } else if (!isLetterOrDigit(ch)) {
            obviouslyIsWord = false; // not sure if it will be word all the way to whitespace
            obviouslyIsNumber = false; // not sure if it will be number all the way to whitespace
            if (nextNonLetterOrNonDigit < 0) nextNonLetterOrNonDigit = i;
            if (nextNonLetterDigitApostrophe < 0 && ch!=APOSTROPHE) {
          nextNonLetterDigitApostrophe = i;
            }
            if (nextNonDigit < 0) nextNonDigit = i;
            if (nextNonTelephoneOrPostalChar < 0 && !isTelephoneNumberChar(ch)) {
          nextNonTelephoneOrPostalChar = i;
            }
            if (nextNonNumericChar < 0 && !isNumericChar(ch)) {
          nextNonNumericChar = i;
            }
            // don't break here though, keep going to set nextWhitespace correctly for other uses
        } else if (!isDigit(ch)) {
            obviouslyIsNumber = false; // not sure if it will be number all the way to whitespace
            // since passed nextNonLetterOrNonDigit test above, must be letter, so nextNonLetterOrNonDigit is not changed here
            // since passed !isLetterOrDigit test above, must be letter, so nextNonLetterDigitApostrophe is not changed here
            if (nextNonDigit < 0) nextNonDigit = i;
            if (nextNonTelephoneOrPostalChar < 0 && !isTelephoneNumberChar(ch)) {
          nextNonTelephoneOrPostalChar = i;
            }
            if (nextNonNumericChar < 0 && !isNumericChar(ch)) {
          nextNonNumericChar = i;
            }
        } else {
            // else is a digit, none of the flags need to be set for digit characters.
        }


        i++;
          } while (i < textSegment.length() && !isWhitespace(ch));


          if (i>=textSegment.length()) {
        if (nextWhitespaceOrEndOfSegment < 0) nextWhitespaceOrEndOfSegment = textSegment.length();
        if (nextNonLetterOrNonDigit < 0) nextNonLetterOrNonDigit = textSegment.length();
        if (nextNonLetterDigitApostrophe < 0) nextNonLetterDigitApostrophe = textSegment.length();
        if (nextNonTelephoneOrPostalChar < 0) nextNonTelephoneOrPostalChar = textSegment.length();
        if (nextNonNumericChar < 0) nextNonNumericChar = textSegment.length();
          }
          //System.err.println("nextWhitespaceOrEndOfSegment = " + nextWhitespaceOrEndOfSegment);


          
          if (obviouslyIsNumber) {
            tokenLen = nextWhitespaceOrEndOfSegment - currentPosition;
            tokenClass = NumToken.class;
          } else if (obviouslyIsWord) {
        // Check for things like "cannot" and "gonna" that appear to be one token but
        // are supposed to be more than one according to PTB rules.
        String lowerCasedSubstring = textSegment.substring(currentPosition, nextWhitespaceOrEndOfSegment).toLowerCase();
        int len = ContractionsPTB.lenOfFirstTokenInContraction(lowerCasedSubstring);
        if (len > 0) { // is a contraction that doesn't contain an apostrophe, like "gonna", create WordToken for first part, 
                // and create ContractionToken for other token(s)  
            tokenLen = len;
            tokenClass = WordToken.class;
            bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
            tokens.add(bta);
            currentPosition+=tokenLen; // currentPosition
            
            len = ContractionsPTB.lenOfSecondTokenInContraction(lowerCasedSubstring);
            
            tokenLen = len;
            tokenClass = ContractionToken.class;
            
            len = ContractionsPTB.lenOfThirdTokenInContraction(lowerCasedSubstring);
            if (len>0) { // if there is a 3rd, create the 2nd and set up for the 3rd to be created later
          bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
          tokens.add(bta);
          currentPosition+=tokenLen; // currentPosition


          tokenLen = len;
          tokenClass = ContractionToken.class;
            }
        } else {
            tokenLen = nextWhitespaceOrEndOfSegment - currentPosition;
            tokenClass = WordToken.class;
        }
          } else { // Still within the "isLetterOrDigit(firstCharOfToken)" but not obviously number or word


        int len;


        
        ContractionResult cr;
        
        // Not sure what the token is, the token could extend to 
        // include all to the end of an email address, 
        // or include all to the end of a URL, 
        // or include all to the end of a URL, 
        // or through the next period (for an abbreviation)
        // or to the next hyphen, 
        // or beyond, 
        // or to the next whitespace (note already handle case of all alphanums to whitespace
        // or to the end of input (note already handle case of all alphanums to end of input
        // or the next apostrophe (for a most contractions) 
        // or until "n't" for such contractions
        // or the next other punctuation symbol
        // or beyond (for 80's)
        // or could include some punctuation like 3,245.51


        // Need to check for things like 80's before checking for contractions or else 80's looks like a contraction
              if (nextNonLetterOrNonDigit < lowerCasedText.length() && lowerCasedText.charAt(nextNonLetterOrNonDigit)==APOSTROPHE) {
                  String lowerCasedSubstring = lowerCasedText.substring(currentPosition, nextWhitespaceOrEndOfSegment);
                  len = ContractionsPTB.tokenLengthCheckingForSingleQuoteWordsToKeepTogether(lowerCasedSubstring);
                  if (len > nextNonLetterOrNonDigit-currentPosition) { // if keeping the apostrophe attached
                tokenLen = len;
                tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
                  } // else let contraction checking later determine what to do


                  
              }
              if (tokenLen == NOT_SET_INDICATOR) { // not found yet
                  if ((cr = ContractionsPTB.getLengthIfNextApostIsMiddleOfContraction(currentPosition, nextNonLetterOrNonDigit, lowerCasedText)) != null) {
                len = cr.getWordTokenLen();
                tokenLen = len;
                tokenClass = WordToken.class;
                char c = lowerCasedText.charAt(currentPosition+len);
                if (c=='n' || c==APOSTROPHE) { // if a "n't" contraction or a contraction where contraction token starts with '
                    if (tokenLen < 0) throw new RuntimeException("c = " + c + "tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
                    // First create the WordToken (no apostrophe)
                    if(tokenLen > 0){
                      bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
                      //System.out.println("bta = " + bta + " class = " + bta.getClass() + " tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
                      tokens.add(bta);
                      currentPosition+=tokenLen; // currentPosition
                    }
                    // Set up to create the second token, for other contractions, the next token will start with an 
                    // apostrophe and be handled above... but for "n't" contractions, next token won't start with apostrophe
                    // so just go ahead and handle it here instead of having to keep track of previous 
                    // and handle n't in next loop.
                    tokenLen = cr.getContractionTokenLen();
                    // if (tokenLen!=3) throw new RuntimeException("getContractionTokenLen != 3 for n't");
                    tokenClass = ContractionToken.class;
                } else {
                    throw new RuntimeException("ERROR: getLengthIfNextApostIsMiddleOfContraction returned " + len + " but the character (" + c +") after that is not 'n' or apostrophe ");
                }
            
            
                  } else if ((len = lenIfIsTelephoneNumber(currentPosition, lowerCasedText, nextNonTelephoneOrPostalChar)) > 0) {
                tokenLen = len;
                tokenClass = WordToken.class;
                  } else if ((len = lenIfIsPostalCode(currentPosition, lowerCasedText, nextNonTelephoneOrPostalChar)) > 0) {
                tokenLen = len;
                tokenClass = WordToken.class;
                  } else if ((len = lenIfIsUrl(currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment)) > 0) {
                tokenLen = len;
                tokenClass = WordToken.class;
                  } else if ((len = lenIfIsEmailAddress(currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment)) > 0) {
                tokenLen = len;
                tokenClass = WordToken.class;
                  } else if ((len = lenIfIsAbbreviation(currentPosition, textSegment, nextWhitespaceOrEndOfSegment)) > 0) {
                tokenLen = len;
                tokenClass = WordToken.class;
                  } else { // Still within the "isLetterOrDigit(firstCharOfToken)".
                // not obviously a word or number (already checked those)
                // and not Url, EmailAddress, or Abbreviation
                // There could be a hyphen before the next white space,
                // or a symbol before the next whitespace
                // or apostrophe like in 80's or P'yongyang (one token each) or James' or Ted's (2 tokens each)
                // Take alphanums, but consider hyphenated words and names with apostrophes 
                // and consider tele numbers and postal codes


                //            if (true) { // TBD comment out this debug code
                //          System.out.println("lowerCasedSubstring = " + quoted(lowerCasedSubstring));
                //          System.out.println("currentPosition = " + currentPosition);
                //          System.out.println("nextWhitespaceOrEndOfSegment = " + nextWhitespaceOrEndOfSegment);
                //          System.out.println("nextNonLetterOrNonDigit = " + nextNonLetterOrNonDigit);
                //          System.out.println("nextNonLetterDigitApostrophe = " + nextNonLetterDigitApostrophe);
                //            }


                if (nextNonLetterOrNonDigit<lowerCasedText.length() && lowerCasedText.charAt(nextNonLetterOrNonDigit)==HYPHEN_OR_MINUS_SIGN) {
                    // telephone numbers and postal codes handled above already
                    String lowerCasedSubstring = lowerCasedText.substring(currentPosition, nextWhitespaceOrEndOfSegment);
                    len = HyphenatedPTB.tokenLengthCheckingForHyphenatedTerms(lowerCasedSubstring);
                    tokenLen = len;
                    if (tokenLen < 0) throw new RuntimeException("tokenLen = " + tokenLen + " currentPosition = " + currentPosition + " nextNonLetterOrNonDigit = " + nextNonLetterOrNonDigit);
                    tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
                } else if (nextNonNumericChar > 0 && (len = lenIfIsNumberContainingComma(currentPosition, lowerCasedText, nextNonNumericChar)) > 0) {
                    tokenLen = len;
                    tokenClass = NumToken.class;
                } else if (nextNonLetterDigitApostrophe < lowerCasedText.length() && lowerCasedText.charAt(nextNonLetterDigitApostrophe)==PERIOD) {
                    // see if is a number with a decimal place (without commas, comma-containing numbers are handled above)
                    if (nextNonDigit==lowerCasedText.length()-1) {
                  // end of sentence, don't include the period as part of the number, count it as end of sentence marker (punctuation)
                  tokenLen = nextNonDigit - currentPosition;
                  //if (tokenLen<1) throw new RuntimeException("Period at end of sentence " + nextNonDigit + " " + nextNonLetterDigitApostrophe+" "+tokenLen+ " " + lowerCasedText);
                  tokenClass = NumToken.class;
                    } else if (nextNonLetterDigitApostrophe==nextNonDigit) {
                  // if not end of sentence, do include period (decimal point) in the NumToken
                  tokenLen = nextNonDigit + 1 + getLenToNextNonDigit(lowerCasedText, nextNonDigit+1) - currentPosition;
                  tokenClass = NumToken.class;
                    }
                    else {
                  // something like 2J3. which is not a number or 2'3.
                  tokenLen = nextNonLetterOrNonDigit - currentPosition;
                  tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
                    }
                } else { // breaking character is not - character and not ' character, so stop there
                    tokenLen = nextNonLetterOrNonDigit - currentPosition;
                    tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
                }
                //} else {
                //    throw new UnsupportedOperationException("nextNonLetterDigitApostrophe = " + nextNonLetterDigitApostrophe);
                //}


                  }


              }
          }
          
      } else { // some other symbol or punctuation not included in isPunctuation
          // Since not processing 'web-text', no need to look for things like :)
          // so it is some type of 1-character symbol token
          tokenLen = 1; 
          tokenClass = SymbolToken.class;
      }
      
      // add the token created
      if (tokenLen < 0) throw new RuntimeException("tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
      bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
      //System.out.println("bta = " + bta + " class = " + bta.getClass() + " tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
      tokens.add(bta);
      currentPosition+=tokenLen; // currentPosition


    } // end while loop
    
    return tokens;
    
  }


  /**
   * Tokenize a string that is assumed to be the entire document (or at least to start at 0)
   * @param text the String to tokenize
   * @return the list of new tokens
   */
  public List<?> tokenize(String text) {
      int offsetAdjustment = 0;
      JCas jcas = null;
      return tokenizeTextSegment(jcas, text, offsetAdjustment, true);
  }
  
  
  private static char DASH = '-';




  /**
   * such as -4,012.67 or 5 or 5.5 or 4,000,153
   * @param currentPosition
   * @param text
   * @param nextNonNumericChar
   * @return
   */
  private int lenIfIsNumberContainingComma(int currentPosition, String text, int nextNonNumericChar) {
      String s = text.substring(0, nextNonNumericChar); // use substring so don't search until end of entire document
      int commaPosition = s.indexOf(COMMA, currentPosition);
      if (commaPosition<0) return -1;
      if (commaPosition>nextNonNumericChar) return -1;
      int len = -1;
      
      int periodPosition = s.indexOf(PERIOD, currentPosition);
      int endOfWholeNumberPart = periodPosition;
      if (endOfWholeNumberPart<0) endOfWholeNumberPart = s.length();
      // the whole number part can contain commas as long as there are exactly 3 digits after each comma
      if (commaPosition>endOfWholeNumberPart) return -1; // if comma appears after the decimal point, then no commas in the whole-number-part
      if (commaPosition==0) return -1; // can't start with comma


      int position = commaPosition;  
      
      boolean didNotFindExactlyThreeDigitsAfterComma = false;
      
      while (!didNotFindExactlyThreeDigitsAfterComma) {
    len = position-currentPosition; // don't include the comma unless also can include next 3 digits
    if (position<endOfWholeNumberPart && s.charAt(position)==COMMA) {
        position++;
    }
    for (int i=0; i<3; i++) { // 3 digits after the comma if comma is part of a number
        if (position<endOfWholeNumberPart && isDigit(s.charAt(position))) {
      position++;
        } else {
      didNotFindExactlyThreeDigitsAfterComma = true;
        }
    }
    if (position<endOfWholeNumberPart && isDigit(s.charAt(position))) { // can't have 4 digits after comma like 3,4567  
        didNotFindExactlyThreeDigitsAfterComma = true; 
    }
      }      
      
      if (len <= 0)  return -1;
      
      // See if there is a decimal point that can continue the number, such as  3,456.56  or 4,012.
      // But if the sentences ends with the period that follows the whole_number_part, count it as the sentence marker 
      // not as part of the number
      if (periodPosition != text.length()-1 && // not the final period of a sentence
        periodPosition == currentPosition+len) { // but the period does appear right after the whole_number_part
    len++; 
    while (len<nextNonNumericChar-currentPosition && isDigit(s.charAt(currentPosition+len))) {
        len++;
    }
      }
      
      return len;
  }


  
  private int lenIfIsPostalCode(int currentPosition, String text, int nextNonPostalCodeChar) {
      if (nextNonPostalCodeChar < 0) return nextNonPostalCodeChar;
      
      int len = nextNonPostalCodeChar-currentPosition;
      
      String s = text.substring(currentPosition, nextNonPostalCodeChar);
      // 55901-0000
      
      if (len == 10) { // 55901-0001
    if (!isDigit(s.charAt(0))) return -1;
    if (!isDigit(s.charAt(1))) return -1;
    if (!isDigit(s.charAt(2))) return -1;
    if (!isDigit(s.charAt(3))) return -1;
    if (!isDigit(s.charAt(4))) return -1;
    if (s.charAt(5)!=DASH) return -1;
    if (!isDigit(s.charAt(6))) return -1;
    if (!isDigit(s.charAt(7))) return -1;
    if (!isDigit(s.charAt(8))) return -1;
    if (!isDigit(s.charAt(9))) return -1;
    return len;
      } else {
    return -1;
      }
      
  }


  
  private int lenIfIsTelephoneNumber(int currentPosition, String text, int nextNonTelephoneNumberChar) {
      
      if (nextNonTelephoneNumberChar < 0) return nextNonTelephoneNumberChar;
      
      int len = nextNonTelephoneNumberChar-currentPosition;
      
      String s = text.substring(currentPosition, nextNonTelephoneNumberChar);
      // extension like 4-5555
      // or without area code like 555-1212
      // or with area code 507-555-1212
      // or with 1, like 1-507-555-1212
      // or like example in guidelines like 02-2348-2192
      
      if (len==6) {
    if (!isDigit(s.charAt(0))) return -1;
    if (s.charAt(1)!=DASH) return -1;
    if (!isDigit(s.charAt(2))) return -1;
    if (!isDigit(s.charAt(3))) return -1;
    if (!isDigit(s.charAt(4))) return -1;
    if (!isDigit(s.charAt(5))) return -1;
    return len;
      } else if (len == 8) {
    if (!isDigit(s.charAt(0))) return -1;
    if (!isDigit(s.charAt(1))) return -1;
    if (!isDigit(s.charAt(2))) return -1;
    if (s.charAt(3)!=DASH) return -1;
    if (!isDigit(s.charAt(4))) return -1;
    if (!isDigit(s.charAt(5))) return -1;
    if (!isDigit(s.charAt(6))) return -1;
    if (!isDigit(s.charAt(7))) return -1;
    return len;
      } else if (len == 12) { // two possible formats
    // first check  507-555-1212 format
    if (!isDigit(s.charAt(0))) return checkFormat2(s);
    if (!isDigit(s.charAt(1))) return checkFormat2(s);
    if (!isDigit(s.charAt(2))) return checkFormat2(s);
    if (s.charAt(3)!=DASH) return checkFormat2(s);
    if (!isDigit(s.charAt(4))) return checkFormat2(s);
    if (!isDigit(s.charAt(5))) return checkFormat2(s);
    if (!isDigit(s.charAt(6))) return checkFormat2(s);
    if (s.charAt(7)!=DASH) return checkFormat2(s);
    if (!isDigit(s.charAt(8))) return checkFormat2(s);
    if (!isDigit(s.charAt(9))) return checkFormat2(s);
    if (!isDigit(s.charAt(10))) return checkFormat2(s);
    if (!isDigit(s.charAt(11))) return checkFormat2(s);
    return len;
      } else if (len == 14) { // 1-507-555-1212
    if (!isDigit(s.charAt(0))) return -1;
    if (s.charAt(1)!=DASH) return -1;
    if (!isDigit(s.charAt(2))) return -1;
    if (!isDigit(s.charAt(3))) return -1;
    if (!isDigit(s.charAt(4))) return -1;
    if (s.charAt(5)!=DASH) return -1;
    if (!isDigit(s.charAt(6))) return -1;
    if (!isDigit(s.charAt(7))) return -1;
    if (!isDigit(s.charAt(8))) return -1;
    if (s.charAt(9)!=DASH) return -1;
    if (!isDigit(s.charAt(10))) return -1;
    if (!isDigit(s.charAt(11))) return -1;
    if (!isDigit(s.charAt(12))) return -1;
    if (!isDigit(s.charAt(13))) return -1;
    return len;
      } else {
    return -1;
      }
      
  }


  private int checkFormat2(String s) { // 02-2348-2192
    if (!isDigit(s.charAt(0))) return -1;
    if (!isDigit(s.charAt(1))) return -1;
    if (s.charAt(2)!=DASH) return -1;
    if (!isDigit(s.charAt(3))) return -1;
    if (!isDigit(s.charAt(4))) return -1;
    if (!isDigit(s.charAt(5))) return -1;
    if (!isDigit(s.charAt(6))) return -1;
    if (s.charAt(7)!=DASH) return -1;
    if (!isDigit(s.charAt(8))) return -1;
    if (!isDigit(s.charAt(9))) return -1;
    if (!isDigit(s.charAt(10))) return -1;
    if (!isDigit(s.charAt(11))) return -1;
      
      return -1;
  }
  
  /**
   * "0123456789-"
   * @param ch
   * @return
   */
  private boolean isTelephoneNumberChar(char ch) { 
      return (isDigit(ch) || ch=='-');
  }


  /**
   * ",.0123456789"
   * @param ch
   * @return
   */
  private boolean isNumericChar(char ch) { // 
      return (isDigit(ch) || ch==',' || ch=='.');
  }


  private int getLenToNextNonDigit(String s, int startingPosition) {
      char ch;
      int i = 0;
      while (startingPosition+i < s.length()) {
    ch = s.charAt(startingPosition+i);
    if (!isDigit(ch)) {
        return i; 
    }
    i++;
      }
      return s.length()-startingPosition;
  }


  private Class<? extends BaseToken> wordTokenOrNumToken(String lowerCasedText, int currentPosition, int tokenLen) {
      if (containsLetter(lowerCasedText, currentPosition, tokenLen)) {
    return WordToken.class;
      } else {
    return NumToken.class;
      }
  }


  /**
   * 
   * @param lowerCasedText
   * @param currentPosition
   * @param tokenLen
   * @return true if at least one of the characters between currentPosition and currentPosition+tokenLen is a letter
   */
  private boolean containsLetter(String lowerCasedText, int currentPosition, int tokenLen) {
      for (int i=currentPosition; i<currentPosition+tokenLen; i++) {
    char c = lowerCasedText.charAt(i);
    if (isLetter(c)) {
        return true;
    }
      }
      return false;
  }


  
  private static String ellipsis = "...";  
  private boolean isEllipsis(int currentPosition, String textSegment) {
      if (textSegment.substring(currentPosition).startsWith(ellipsis)) return true;
      return false;
  }


  static String [] nameStartingWithApostrophe = {"'assad", "'awarta", "'ashira", };
  


  private int getLengthIfNameStartingWithApostrophe(int currentPosition, String textSegment) {
      
      String textLowerCased = textSegment.substring(currentPosition).toLowerCase();
      if (textLowerCased.length() == 1) return -1; // if no more chars after the apostrophe, it's a 1-char token
      if (!isLetter(textSegment.charAt(currentPosition+1))) {
    return -1;
      }
      
      // Could be the start of a quoted string like "'The boy ran', she said" or could be the start of a name like 'Assad
      for (String s:nameStartingWithApostrophe) {
    if (s.length() == textLowerCased.length()) {
        // exactly matches the rest of the input....
        if (textLowerCased.startsWith(s)) return  s.length();
    } else if (s.length() > textLowerCased.length()) {
        ; // can't be match, not long enough
    } else if (textLowerCased.startsWith(s)) {
        return  s.length(); // already checked above that next char after ' exists and is not letter
        
    } else {
        
        // don't want "'The boy ran'" to have "'The" be one token so don't assume 
        // it's a quoted name unless it is in the specific list above
        ; // do nothing in this case, try the next from nameStartingWithApostrophe
                    //        int end = currentPosition+s.length();
                    //        char next = textSegment.charAt(end);
                    //        // ok as long as just more letters
                    //        while (isLetter(next) && end < textSegment.length()) {
                    //      end++;      
                    //        }
                    //        return end-currentPosition; // is correct whether ran out of chars or found non letter 
        
    }
      }
    
      
      return -1;
  }








  private int getLengthIfIsNumberThatStartsWithPeriod(int currentPosition, String textSegment) {
      int len = textSegment.length() -  currentPosition;  
      if (len<2) return -1;
      int index=currentPosition+1;
      char ch = textSegment.charAt(index);
      if (!isDigit(ch)) return -1;
      index++;
      while (index < currentPosition+len){
    ch = textSegment.charAt(index);
    if (!isDigit(ch)) return index-currentPosition;
    index++;
      } 
      
      return len; // all rest were digits
  }




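  /**
   * Assumes no white space between currentPosition and afterEndOfInputToConsider.
   * If the last character of a sentence is a period, then don't include the period with the
   * abbreviation; count it as punctuation.
   * That way we don't have to differentiate between "mg." being an abbreviation and "me." being simply
   * the end of a sentence.
   * @param currentPosition index in mixedCaseText of the first character to examine
   * @param mixedCaseText the text being tokenized, in its original case
   * @param afterEndOfInputToConsider index just past the last character to examine
   * @return the length of the abbreviation that starts at currentPosition, or -1 if there is none
   */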
  private int lenIfIsAbbreviation(int currentPosition, String mixedCaseText, int afterEndOfInputToConsider) {
    // Determine if all up to endOfInputToConsider contains at least 1 letter and ends with period
    // Note input is known to contain at least 1 letter or otherwise would have already been determined to be a number
    boolean containsLetter = false;
    // consider as single abbreviation things like e.g. but for things like 
    // www.nlm.nih.gov (without the http) count as separate tokens 
    if (afterEndOfInputToConsider-currentPosition >= 4 && mixedCaseText.substring(currentPosition, currentPosition+4).toLowerCase().equals("www.")) {
      return -1;
    }
    // Scan one character at a time. The loop body returns as soon as it meets
    // a period or any character that is not a letter.
    for (int i = currentPosition; i < afterEndOfInputToConsider; i++) { 
      char ch = mixedCaseText.charAt(i);
      // peekAhead is the character after ch, or a space when ch is the last character to consider
      char peekAhead;
      if (i+1 < afterEndOfInputToConsider) { 
        peekAhead = mixedCaseText.charAt(i+1);
      } else {
        peekAhead = ' ';
      }


      if (isLetter(ch)) {
        containsLetter = true;
      } else if (ch != PERIOD) { // if any symbol is found before the period, not considering it an abbreviation
        // a digit also lands here, since only letters and periods are accepted
        return -1;
      } else if (!containsLetter || (i+1 == mixedCaseText.length())) {
        // note the end-of-text test uses the length of the whole text, not afterEndOfInputToConsider
        return -1; // no letter, or last character of sentence is this period, in which case period is end of sentence marker, not part of abbreviation
      } else { // is a period and there was a letter before it and this period is not last char in sentence
        // If before the period there are alphanums with at least one letter, and we are 
        // not at the end of the sentence, consider the period to be part of the preceding
        // If there are more alphanums after, also terminated by period, include that too 
        // like in A.D. or e.g.
        int soFar = (i + 1 - currentPosition); // length up to and including this period
        // Recurse on the text right after this period to pick up a further "letters." group
        int len = lenIfIsAbbreviation(i+1, mixedCaseText, afterEndOfInputToConsider);
        // If what's after the period satisfies abbreviation definition itself
        if (len>0) {
          return (soFar + len);
        }
        // else len<=0 and so what's after the period is not more abbreviation
        
        if (Character.isWhitespace(peekAhead) || isPossibleFinalPunctuation(peekAhead)) {
          // "e.g. edema" does have the abbreviation e.g. within it
          return soFar;
        } else if (!isLetterOrDigit(peekAhead)) { // "e.g.[1]" does have the abbreviation e.g. within it
          // NOTE(review): soFar-1 leaves this period out of the returned length, so for
          // "e.g.[1]" the overall result is 3 ("e.g" without the last period) - confirm intended
          return soFar-1;
        }


        // "e.g.abc" is not an abbreviation because the abc follows the . immediately
        return -1; // period is end of sentence or is between alphanums


      }
    }


    // No period found - just all letters
    return -1; 


  }


  private String possibleFinalPunctuation = "?!:";
  private boolean isPossibleFinalPunctuation(char c) {
    if (possibleFinalPunctuation.indexOf(c) > -1) return true;
    return false;
  }


  private String validOtherEmailAddressCharacters = "!#$%&'*+/=?^_`{|}~-"; // those that can be used without quoting or escaping them


  /**
   * Assumes no white space between currentPosition and endOfInputToConsider
   * @param currentPosition
   * @param lowerCasedText
   * @param endOfInputToConsider
   * @return
   */
  private int lenIfIsEmailAddress(int currentPosition, String lowerCasedText, int endOfInputToConsider) {


      int maxLenLocalPart = 64;
      int maxTotalLen = 320;
      int len = -1;
      // (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]
      // @
      // (?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])
      
      char AT = '@';
      char PERIOD = '.'; // as String not char
      int indexOfAt = lowerCasedText.substring(currentPosition, endOfInputToConsider).indexOf(AT);
      if (indexOfAt < 1 || currentPosition+indexOfAt+1==endOfInputToConsider || indexOfAt > maxLenLocalPart) { // '@' can't be the first character, but must be present, and can't be last char 
    // if no @ sign, or not in a valid position, don't bother doing anything more complicated, can't be an email address
    return -1;
      }
      
      
      // @see http://tools.ietf.org/html/rfc3696#section-3


      // ignoring quoted or escape chars
      // ignoring ability to use IP address in square brackets for domain part
     
      // First validate the local part (the part before the @ sign)
      //String localPart = textSegment.substring(currentPosition, currentPosition+indexOfAt);      
      for (int i=currentPosition; i<currentPosition+indexOfAt; i++) {
    char ch = lowerCasedText.charAt(i);
    CharSequence cs = lowerCasedText.subSequence(i, i+1);
    if (!isLetterOrDigit(ch) && !validOtherEmailAddressCharacters.contains(cs)) {
        return -1;
    }
    if (ch == PERIOD && (i==currentPosition || i == currentPosition+indexOfAt-1)) { // first and last of local name can't be period
        return -1;
    }
      }
      
      char prev = '@';
      // The local part appears to be the right format for a valid email address, validate the domain part
      for (int i=currentPosition+indexOfAt+1; i<endOfInputToConsider; i++) {
    char ch = lowerCasedText.charAt(i);
    //CharSequence cs = textSegment.subSequence(i, i+1);
    if (isLetterOrDigit(ch)) { 
        ; // fine, continue
    } else if (ch==HYPHEN_OR_MINUS_SIGN || ch==PERIOD) {
        // either stop one earlier, or error, or include at least one more char
        // Is there at least one more valid character?
        if (i+1<endOfInputToConsider && isLetterOrDigit(lowerCasedText.charAt(i+1)) ) {
      ; // keep going
        } else if (isLetterOrDigit(prev)) {
      return i-currentPosition-1;
        } else { 
      return -1;
        }
    } else { //something else that ends the token, like an exclamation point
        if (isLetterOrDigit(prev)) return i-currentPosition-1;
        else return -1;
    }
      }
      
      len = endOfInputToConsider - currentPosition;
      if (len > maxTotalLen) return -1;
      return len;
  }




  private static String [] urlStarters = { "http://", "https://", "ftp://", "mailto:" };


  private int lenIfIsUrl(int currentPosition, String lowerCasedText, int endOfInputToConsider) {


      // http://host:port/path?search#fragment
      // mailto:joe@example.com
      //
      String potentialUrl = lowerCasedText.substring(currentPosition, endOfInputToConsider);
      for (String s: urlStarters) {
    if (potentialUrl.startsWith(s) && potentialUrl.length() > s.length()) {
        return endOfInputToConsider - currentPosition; // same as  potentialUrl.length()
    }
      }


      return -1;
  }






  //
  private Class<? extends BaseToken> determineTokenType(String s, int begin, int end) {
    
      if (s==null || s.length() < end || begin+1!=end) {
    Exception e = new Exception("ERROR: s not at least one char:  s= " + s + " begin, end = " + begin + "," + end);
    //System.err.println("ERROR: s.length()!=1  s=" + s);
    e.printStackTrace();
    return null; // strings longer than 1 are not supported yet
      }    
      
      char ch = s.charAt(begin);
      if (ch == NEWLINE || ch == CR) return NewlineToken.class;
      if (isDigit(ch)) return NumToken.class;
      if (isLetter(ch)) return WordToken.class;
      if (isContraction(ch)) return ContractionToken.class; 
      if (isPunctuation(ch)) return PunctuationToken.class;
      // if none of the above, it must be a SymbolToken
      return SymbolToken.class;
    
  }


  // classic cTAKES (1.0.5) has a single ContractionToken for the input "It's hard."
  // That token contains the two characters "'s"
  // and "It" is a BaseToken
  private boolean isContraction(char c) {
      // single character cannot be a contraction token. need something like 't or 's or n't 
      return false;
  }


  private boolean verify(int begin, int end, int offsetAdjustment) {
      Exception e = new Exception(begin + " " + end + " " + offsetAdjustment);
      if (begin < 0) {
    System.err.println("ERROR: begin = " + begin);
    e.printStackTrace();
    return false;
      }
      if (end < 0) {
    System.err.println("ERROR: end = " + end);
    e.printStackTrace();
    return false;
      }
      if (end < begin) {
    System.err.println("ERROR: end < begin " + end + " < " + begin);
    e.printStackTrace();
    return false;
      }
      if (offsetAdjustment < 0) {
    System.err.println("ERROR: offsetAdjustment = " + offsetAdjustment);
    e.printStackTrace();
    return false;
      }
      
      return true;
      
  }
  
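  /**
   * Creates a token for the text s.substring(begin, end).
   * If clas is null, determine the token class for the caller.
   * If jcas is null, create a plain Token rather than a JCas-based token annotation.
   * @see org.apache.ctakes.core.ae.TokenConverter#convert(org.apache.ctakes.core.nlp.tokenizer.Token, org.apache.uima.jcas.JCas, int)
   */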
  private Object createToken(Class<? extends BaseToken> clas, String s, JCas jcas, int begin, int end, int offsetAdjustment) {




      int beginFromStartOfDocument = begin + offsetAdjustment;
      int endFromStartOfDocument = end + offsetAdjustment;


      Object token;


      
      if (true) {
    boolean ok = verify(beginFromStartOfDocument, endFromStartOfDocument, offsetAdjustment);
    if (!ok) {
        System.err.println("ERROR: so creating a BaseToken with begin = 0 end = 0 just to avoid exception");
        if (jcas!=null) token = new BaseToken(jcas, 0, 0); else token = new Token(0,0);
        return token;
    }


      }


      if (clas == null) { // determine the type for the caller


    Class<? extends BaseToken> clss = determineTokenType(s, begin, end);
    if (clss==null) throw new RuntimeException(" still is null");
    
    if (jcas!=null) 
        token = createToken(clss, s, jcas, begin, end, offsetAdjustment);
    else {
        token = new Token(begin, end);
        ((Token)token).setText(s.substring(begin, end));
    }


      } else if (clas.equals(NewlineToken.class)) {


    if (jcas!=null) 
        token = new NewlineToken(jcas, beginFromStartOfDocument, endFromStartOfDocument);
    else { 
        token = new Token(beginFromStartOfDocument, endFromStartOfDocument);
        ((Token)token).setText(s.substring(begin, end));
    }


      } else if (clas.equals(NumToken.class)) {


    if (jcas!=null) { 
        token = new NumToken(jcas, beginFromStartOfDocument, endFromStartOfDocument);
        String tokenText = s.substring(begin, end);
        setNumType((NumToken)token, tokenText);
    }
    else {
        token = new Token(beginFromStartOfDocument, endFromStartOfDocument);
        ((Token)token).setText(s.substring(begin, end));
    }
    
      } else if (clas.equals(WordToken.class)) {


    if (jcas!=null) { 
        token = new WordToken(jcas, beginFromStartOfDocument, endFromStartOfDocument);
        String tokenText = s.substring(begin, end);
        setCapitalization((WordToken)token, tokenText);
        setNumPosition((WordToken)token, tokenText);
    }
    else  {
        token = new Token(beginFromStartOfDocument, endFromStartOfDocument);
        ((Token)token).setText(s.substring(begin, end));
    }


      } else if (clas.equals(SymbolToken.class)) {


    if (jcas!=null) 
        token = new SymbolToken(jcas, beginFromStartOfDocument, endFromStartOfDocument);
    else {
        token = new Token(beginFromStartOfDocument, endFromStartOfDocument);
        ((Token)token).setText(s.substring(begin, end));
    }




      } else if (clas.equals(PunctuationToken.class)) {


    if (jcas!=null) 
        token = new PunctuationToken(jcas, beginFromStartOfDocument, endFromStartOfDocument);
    else {
        token = new Token(beginFromStartOfDocument, endFromStartOfDocument);
        ((Token)token).setText(s.substring(begin, end));
    }
        


      } else if (clas.equals(ContractionToken.class)) {


    if (jcas!=null) 
        token = new ContractionToken(jcas, beginFromStartOfDocument, endFromStartOfDocument);
    else {
        token = new Token(beginFromStartOfDocument, endFromStartOfDocument);
        ((Token)token).setText(s.substring(begin, end));
    }


      } else if (clas.equals(BaseToken.class)) {


    if (jcas!=null) 
        token = new BaseToken(jcas, beginFromStartOfDocument, endFromStartOfDocument);
    else {
        token = new Token(beginFromStartOfDocument, endFromStartOfDocument);
        ((Token)token).setText(s.substring(begin, end));
    }


      } else {


    if (jcas!=null) { 
        System.err.println("clas=" + clas + " and need to add more code here to support that class");
        token = null;
    }
    else {
        token = new Token(beginFromStartOfDocument, endFromStartOfDocument);
        ((Token)token).setText(s.substring(begin, end));
    }
    
      }






            //      if (true) { // TBD remove this debug code
            //    System.out.println(" ---------------------------------------------------- ");
            //    System.out.println("token = " + token);
            //    System.out.println("token.getCoveredText() = " + token.getCoveredText());
            //    System.out.println("token.getClass().getName() = " + token.getClass().getName());
            //    Class cl = token.getClass();
            //    Field [] fields = cl.getFields();
            //    for (Field f: fields) {
            //        System.out.println(f.getName() + " " + f);
            //      try {
            //          System.out.println("  " + f.getInt(token));
            //      } catch (IllegalArgumentException e) {
            //          System.out.println("IllegalArgumentException"); //e.printStackTrace();
            //      } catch (IllegalAccessException e) {
            //          System.out.println("IllegalAccessException"); //e.printStackTrace();
            //      }
            //        
            //    }
            //      }


      return token;
  
  }


  /**
   * @see org.apache.ctakes.core.nlp.tokenizer.Tokenizer#isNumber
   */
  private void setNumType(NumToken nta, String tokenText) {
      if (org.apache.ctakes.core.nlp.tokenizer.Tokenizer.isNumber(tokenText) && !tokenText.contains(".")) {
    nta.setNumType(TokenizerAnnotator.TOKEN_NUM_TYPE_INTEGER);
      } else {
    nta.setNumType(TokenizerAnnotator.TOKEN_NUM_TYPE_DECIMAL);
      }
  }


  private void setNumPosition(WordToken wta, String tokenText) {
    if ( tokenText.isEmpty() ) {
      // was getting ioobE from tokenText.charAt(..)
      // Possibilities like this (empty, null) should always be checked
      // - but I wonder that we get (want) empty tokens at all.
      // I believe that working with zero-length words is a bug, 
      // and this is not a fix it merely avoids a crash.
      wta.setNumPosition( TokenizerAnnotator.TOKEN_NUM_POS_NONE );
      return;
    }
      if (isDigit(tokenText.charAt(0)))  {


    wta.setNumPosition(TokenizerAnnotator.TOKEN_NUM_POS_FIRST);


      } else if (isDigit(tokenText.charAt(tokenText.length()-1))) {


    wta.setNumPosition(TokenizerAnnotator.TOKEN_NUM_POS_LAST);


      } else {


    boolean containsDigit = false;
    for (int i=0; i<tokenText.length(); i++) {
        char ch = tokenText.charAt(i);
        if (isDigit(ch)) containsDigit = true; 
    }


    if (containsDigit) { 
        wta.setNumPosition(TokenizerAnnotator.TOKEN_NUM_POS_MIDDLE);
    } else { 
        wta.setNumPosition(TokenizerAnnotator.TOKEN_NUM_POS_NONE);
    }


      }


  }


  /**
   * @see org.apache.ctakes.core.nlp.tokenizer.Tokenizer#applyCapitalizationRules
   */
  private void setCapitalization(WordToken wta, String tokenText) {


      int countUpperCase = 0;
      boolean containsNonUpperCase = false;
      for (int i=0; i<tokenText.length(); i++) {
    char ch = tokenText.charAt(i);
    if (isUpperCase(ch)) {
        countUpperCase++;
    } else {
        containsNonUpperCase = true;
    }
      }


      if (countUpperCase==0) { 
    wta.setCapitalization(TokenizerAnnotator.TOKEN_CAP_NONE); 
      } else if (!containsNonUpperCase) {
    wta.setCapitalization(TokenizerAnnotator.TOKEN_CAP_ALL); 
      } else if (countUpperCase==1 && isUpperCase(tokenText.charAt(0))) {
    wta.setCapitalization(TokenizerAnnotator.TOKEN_CAP_FIRST_ONLY); 
      } else {
    wta.setCapitalization(TokenizerAnnotator.TOKEN_CAP_MIXED); 
      }


  }




  /*
   * Find the index of the first character of the next token, where
   * the index is >= startPosition, and the previous token ended at
   * startPosition-1 (or there was no previous token for the 1st time)
   * Returns -1 if there are no more tokens (eof or all white space
   * but no newlines)
   */
  public int findFirstCharOfNextToken(String s, int startPosition) {


      for (int position = startPosition; position < s.length(); position++) {


    // find a non-whitespace character or a newline
    // A newline is the start of a NewlineToken
    if (position < 0) {
        System.out.println("position = " + position);
    }
    char c = s.charAt(position);


    if (!isWhitespace(c)) { // the only token that can start with whitespace is a NewlineToken
        return position;
    }


    if (isEndOfLine(c)) {
        return position;
    }


    //      char peekAhead;
    //      peekAhead = NEWLINE; // treat EOF like newline for tokenization purposes
    //      if (position+1<s.length()) peekAhead = s.charAt(position+1);


      }


      return -1;


  }


  private boolean isEndOfLine(char c) {
      if (c==NEWLINE || c==CR) return true;
      return false;
  }


  // Sample inputs that runNumberTests() passes to lenIfIsNumberContainingComma
  static String [] testsForNumbers = {"2,000,123.For", "92,000,123.", "2,000,123.", "2,000,123.0", "2,000,13", "2", "2.", "2,", "22", "12345678901@4", "2.2.2."};


  // Sample inputs that runEmailTests() passes to lenIfIsEmailAddress,
  // both as-is and with a short prefix prepended
  static String [] testsForEmailAddress = {"masanz@mayo.edu", "masanz@mayo", "m@l", "m.@p", "m.n.@p", "3@4",
      "%@f", "R@@", "MASANZ@MAYO", "jk$jk@.m", "asdf@.m$", "masanz.james-mi@ibm.com.us", ".mn@p", ".@p", "@t",  };


  public static void main(String[] args) {


      runEmailTests();
      runNumberTests();
  }


  static void runNumberTests() {
      TokenizerPTB tester = new TokenizerPTB();
      int len;
      for (String s: testsForNumbers) {
    len = tester.lenIfIsNumberContainingComma(0, s, Math.min(s.length(),11));
    System.out.println("========== Test NumberWithComma ========== ");
    System.out.println(s);
    System.out.println(len);
    
      }
      
  }
  static void runEmailTests() {


      TokenizerPTB tester = new TokenizerPTB();


      for (String s: testsForEmailAddress) {


    int i = tester.lenIfIsEmailAddress(0, s, s.length());
    String prepend = "XYZ";
    int j = tester.lenIfIsEmailAddress(prepend.length(), prepend+s, s.length() + prepend.length());
    System.out.println("========== Test ========== ");
    System.out.println("      0123456789ABCDEF");
    System.out.println("  s = " + s + "\t  and prepend+s = " + prepend+s);
    System.out.println("  lenIfIsEmailAddress = " + i + "\t   and if prepend, len = " + j);






      }


  }


}




// createToken(Class clas, String s, JCas jcas, int begin, int end, int offsetAdjustment) {
//      {
//    try {
//        System.out.println("INFO: Creating " + (clas==null? "(null class)": clas.getName()));
//        System.out.println("INFO:   token for '" + s.substring(begin,end) + "'");
//    } catch (Exception e) {
//        System.out.println("ERROR:  Unable to print substring for '" + s + "'" + begin + "," + end);        
//        e.printStackTrace(); 
//    } finally {
//        System.out.println("INFO:                                   full string is  '" + s + "'  " + begin + "," + end + "," + offsetAdjustment); 
//    }
//    
//      }
Source Code of org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB

Related Classes of org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB