// Source code of org.apache.ctakes.ytex.tools.SetupAuiFirstWord

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.ytex.tools;


import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;


import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.support.TransactionTemplate;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import org.apache.ctakes.core.nlp.tokenizer.Token;
import org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB;
import org.apache.ctakes.ytex.kernel.KernelContextHolder;
import org.apache.ctakes.ytex.umls.dao.UMLSDao;
import org.apache.ctakes.ytex.umls.model.UmlsAuiFirstWord;


import gov.nih.nlm.nls.lvg.Api.LvgCmdApi;


/**
 * setup umls_aui_fword table
 * 
 * @author vijay
 * 
 */
public class SetupAuiFirstWord {
  private static final Log log = LogFactory.getLog(SetupAuiFirstWord.class);
  // private static final Pattern nonWord = Pattern.compile("\\W");
  private TokenizerPTB tokenizer;
  private LvgCmdApi lvgCmd;
  private Set<String> exclusionSet = null;


  /**
   * Initialize tokenizer using the hyphen map from
   * "tokenizer/hyphenated.txt". Use freqCutoff of 0. If this is changed in
   * the TokenizerAnnotator.xml uima config, then the tokenization here will
   * not match the tokenization done during document processing.
   * <p/>
   * Initialize exclusionSet from LvgAnnotator.xml. The exclusion set should
   * be case insensitive, but it isn't that way in the LvgAnnotator so we
   * retain the same functionality.
   * <p/>
   * Initialize LVG. copied from
   * edu.mayo.bmi.uima.lvg.resource.LvgCmdApiResourceImpl.
   * 
   * @throws Exception
   */
  public SetupAuiFirstWord() throws Exception {
    initTokenizer();
    // initialize exclusion set
    initExclusionSet();
    initLvg();
  }


  /**
   * initialize lvgCmd
   */
  private void initLvg() {
    // See
    // http://lexsrv2.nlm.nih.gov/SPECIALIST/Projects/lvg/2008/docs/userDoc/index.html
    // See
    // http://lexsrv3.nlm.nih.gov/SPECIALIST/Projects/lvg/2008/docs/designDoc/UDF/flow/index.html
    // Lower-case the terms and then uninflect
    // f = using flow components (in this order)
    // l = lower case
    // b = uninflect a term
    try {
      URL uri = this.getClass().getClassLoader()
          .getResource("org/apache/ctakes/lvg/data/config/lvg.properties");
      if (log.isInfoEnabled())
        log.info("loading lvg.properties from:" + uri.getPath());
      File f = new File(uri.getPath());
      String configDir = f.getParentFile().getAbsolutePath();
      String lvgDir = configDir.substring(0, configDir.length()
          - "data/config".length());
      System.setProperty("user.dir", lvgDir);
      lvgCmd = new LvgCmdApi("-f:l:b", f.getAbsolutePath());
    } catch (Exception e) {
      log.warn(
          "could not initialize lvg - will not create a stemmed dictionary.",
          e);
    }
  }


  /**
   * initialize lvg exclusion set
   * 
   * @throws ParserConfigurationException
   * @throws SAXException
   * @throws IOException
   */
  private void initExclusionSet() throws ParserConfigurationException,
      SAXException, IOException {
    this.exclusionSet = new HashSet<String>();
    InputStream isLvgAnno = null;
    try {
      isLvgAnno = this
          .getClass()
          .getClassLoader()
          .getResourceAsStream(
              "ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml");
      if(isLvgAnno == null) {
        log.warn("classpath:ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml not available, attempting to load from file system");
        File f = new File("../ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml");
        if(f.exists())
          isLvgAnno = new BufferedInputStream(new FileInputStream(f));
      } 
      if (isLvgAnno == null) {
        log.warn("ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml not available, using empty exclusion set");
      } else {
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory
            .newInstance();
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc = dBuilder.parse(isLvgAnno);
        NodeList nList = doc.getElementsByTagName("nameValuePair");
        for (int i = 0; i < nList.getLength(); i++) {
          Element e = (Element) nList.item(i);
          String name = e.getElementsByTagName("name").item(0)
              .getChildNodes().item(0).getNodeValue();
          if ("ExclusionSet".equals(name)) {
            NodeList nListEx = e.getElementsByTagName("string");
            for (int j = 0; j < nListEx.getLength(); j++) {
              exclusionSet.add(nListEx.item(j).getChildNodes()
                  .item(0).getNodeValue());
            }
          }
        }
      }
    } finally {
      if (isLvgAnno != null)
        isLvgAnno.close();
    }
  }


  /**
   * initialize the tokenizer. loads the hypenated word list.
   * 
   * @throws FileNotFoundException
   * @throws IOException
   */
  private void initTokenizer() throws FileNotFoundException, IOException {
    this.tokenizer = new TokenizerPTB();
  }


  /**
   * @param args
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
    SetupAuiFirstWord setupFword = new SetupAuiFirstWord();
    setupFword.setupAuiFirstWord();
  }


  public void setupAuiFirstWord() {
    UMLSDao umlsDao = KernelContextHolder.getApplicationContext().getBean(
        UMLSDao.class);
    TransactionTemplate t = new TransactionTemplate(KernelContextHolder
        .getApplicationContext().getBean(
            PlatformTransactionManager.class));
    t.setPropagationBehavior(TransactionTemplate.PROPAGATION_REQUIRES_NEW);
    // delete all records
    // umlsDao.deleteAuiFirstWord();


    // get all auis and their strings
    // restart processing after the last aui we processed.
    // if this is null, then just process everything
    String lastAui = umlsDao.getLastAui();
    List<Object[]> listAuiStr = null;
    do {
      // get the next 10k auis
      listAuiStr = umlsDao.getAllAuiStr(lastAui);
      // put the aui - fword pairs in a list
      List<UmlsAuiFirstWord> listFword = new ArrayList<UmlsAuiFirstWord>(
          1000);
      for (Object[] auiStr : listAuiStr) {
        String aui = (String) auiStr[0];
        String str = (String) auiStr[1];
        lastAui = aui;
        if (str.length() < 200) {
          try {
            UmlsAuiFirstWord fw = this.tokenizeStr(aui, str);
            if (fw == null)
              log.error("Error tokenizing aui=" + aui + ", str="
                  + str);
            else if (fw.getFword().length() > 70)
              log.debug("fword too long: aui=" + aui + ", str="
                  + fw.getFword());
            else if (fw.getTokenizedStr().length() > 250)
              log.debug("string too long: aui=" + aui + ", str="
                  + str);
            else {
              if (log.isDebugEnabled())
                log.debug("aui=" + aui + ", fw=" + fw);
              listFword.add(fw);
            }
          } catch (Exception e) {
            log.error("Error tokenizing aui=" + aui + ", str="
                + str, e);
          }
        } else {
          log.debug("Skipping aui because str to long: aui=" + aui
              + ", str=" + str);
        }
      }
      // batch insert
      if (listFword.size() > 0) {
        umlsDao.insertAuiFirstWord(listFword);
        log.info("inserted " + listFword.size() + " rows");
      }
    } while (listAuiStr.size() > 0);
  }


  /**
   * tokenize the umls concept. copied from
   * edu\mayo\bmi\dictionarytools\CreateLuceneIndexFromDelimitedFile.java.
   * 
   * Stem the concept. Stemming performed analogous to LvgAnnotator.
   * 
   * @param aui
   * @param str
   * @return
   * @throws Exception
   */
  public UmlsAuiFirstWord tokenizeStr(String aui, String str)
      throws Exception {
    List<?> list = tokenizer.tokenize(str);
    Iterator<?> tokenItr = list.iterator();
    int tCount = 0;
    String firstTokenText = "";
    StringBuilder tokenizedDesc = new StringBuilder();
    String firstTokenStem = "";
    StringBuilder stemmedDesc = new StringBuilder();


    // get first word token and
    while (tokenItr.hasNext()) {
      tCount++;
      Token t = (Token) tokenItr.next();
      if (tCount == 1) {
        firstTokenText = t.getText(); // first token (aka "first word")
        tokenizedDesc.append(firstTokenText);
        if (this.lvgCmd != null) {
          firstTokenStem = stemToken(t);
          stemmedDesc.append(firstTokenStem);
        }
      } else { // use blank to separate tokens
        tokenizedDesc.append(" ").append(t.getText());
        // stem the next token, add it to the stemmed desc only if there
        // is a valid first word
        if (this.lvgCmd != null && firstTokenStem != null) {
          String stemmedWord = stemToken(t);
          stemmedDesc.append(" ").append(stemmedWord);
        }
      }
    }
    UmlsAuiFirstWord fw = new UmlsAuiFirstWord();
    fw.setAui(aui);
    fw.setFword(firstTokenText.toLowerCase(Locale.ENGLISH));
    fw.setTokenizedStr(tokenizedDesc.toString());
    if (this.lvgCmd != null) {
      fw.setFstem(firstTokenStem.toLowerCase(Locale.ENGLISH));
      fw.setStemmedStr(stemmedDesc.toString());
    }
    return fw;
  }


  /**
   * 
   * @param t
   *            token
   * @return stemmed text if token is a word and stemmed text is non-empty.
   *         else raw token text.
   * @throws Exception
   */
  private String stemToken(Token t) throws Exception {
    String stemmedWord = t.getText();
    if (Token.TYPE_WORD == t.getType() || Token.TYPE_UNKNOWN == t.getType()) {
      stemmedWord = this.getCanonicalForm(t.getText());
      if (stemmedWord == null || stemmedWord.length() == 0) {
        stemmedWord = t.getText();
      }
    }
    return stemmedWord;
  }


  /**
   * copied from edu.mayo.bmi.uima.lvg.ae.LvgAnnotator
   * 
   * @param word
   * @return
   * @throws Exception
   */
  private String getCanonicalForm(String word) throws Exception {
    if (lvgCmd == null || this.exclusionSet.contains(word))
      return null;
    String canonicalForm = null;
    String out = lvgCmd.MutateToString(word);
    // vng null check
    String[] output = null;
    if (out != null)
      output = out.split("\\|");
    else {
      log.warn("mutateToString returned null for: " + word);
    }


    if ((output != null) && (output.length >= 2)
        && (!output[1].matches("No Output"))) {
      canonicalForm = output[1];
    }
    return canonicalForm;
  }
}
// Source code of org.apache.ctakes.ytex.tools.SetupAuiFirstWord
//
// Related classes of org.apache.ctakes.ytex.tools.SetupAuiFirstWord