// Source Code of org.apache.ctakes.dictionary.lookup2.dictionary.DictionaryDescriptorParser

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.dictionary.lookup2.dictionary;


import org.apache.ctakes.dictionary.lookup2.consumer.TermConsumer;
import org.apache.ctakes.dictionary.lookup2.util.DictionarySpec;
import org.apache.ctakes.dictionary.lookup2.util.UmlsUserApprover;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
import org.apache.uima.resource.ResourceAccessException;
import org.apache.uima.resource.ResourceInitializationException;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;


import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;


/**
 * Parses the XML descriptor indicated by the {@code externalResource} for {@code RareWordTermsDescriptorFile}
 * in the XML descriptor for the Rare Word Term Lookup Annotator
 * {@link org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator}
 * </p>
 * If there is a problem with the descriptor then the whole pipeline goes down, so care must be taken by the User
 * and any messages (logged or otherwise) produced by this class should be as specific as possible.  Devs take notice.
 * <p/>
 * TODO
 * This parser can create a RareWordDictionary by wrapping the older Jdbc, Lucene, StringTable (CSV) descriptors.
 * However, to prevent the dependency upon the current Dictionary-Lookup module and its "Dictionary" interface,
 * all such code has been commented out.  Uncommenting, linking, and rebuilding is possible if use of an older dictionary
 * resource is required.
 * TODO
 * Author: SPF
 * Affiliation: CHIP-NLP
 * Date: 11/20/13
 */
final public class DictionaryDescriptorParser {


   // LOG4J logger based on class name
   static private final Logger LOGGER = Logger.getLogger( "DictionaryDescriptorParser" );


   /**
    * A <B>Utility Class</B> cannot be instantiated
    */
   private DictionaryDescriptorParser() {
   }


   /**
    * XML key specifying the section that defines each {@link RareWordDictionary} that should be used for annotation
    */
   static private final String DICTIONARIES_KEY = "rareWordDictionaries";
   /**
    * Each {@link RareWordDictionary} should have an id that specifies a unique name for that dictionary
    */
   static private final String NAME_ID = "id";
   /**
    * Each {@link RareWordDictionary} must have an external resource specified by the
    * {@code configurableDataResourceSpecifier} in the XML descriptor for the Rare Word Term Lookup Annotator
    * {@link org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator}.
    * The external resource <i>does not</i> need to be unique for each dictionary.
    */
   static private final String EXTERNAL_RESOURCE = "externalResourceKey";
   /**
    * Each {@link RareWordDictionary} can utilize or ignore the case of terms.   In most situations case sensitivity
    * is not beneficial, but it may be for some.  For instance, if it is an acronym dictionary then differentiating
    * between "WHO" (World Health Organization) and "who" is important.
    * The {@link org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator}
    * does ignores case and ignores this setting
    */
   static private final String CASE_SENSITIVE = "caseSensitive";
   /**
    * Each {@link RareWordDictionary} should have a numerical {@code typeId} that indicates the semantic group
    * to which the terms in the dictionary belong.  The standard cTakes type ids are numerical and listed in
    * {*link org.apache.ctakes.typesystem.type.constants.CONST} as
    * <ul>
    * <li>0  Unknown</li>
    * <li>1  Medication / Drug</li>
    * <li>2  Disease / Disorder</li>
    * <li>3  Sign / Symptom (Finding)</li>
    * <li>4  <i>Not Defined</i></li>
    * <li>5  Procedure</li>
    * <li>6  Anatomical Site</li>
    * <li>7  Clinical Attribute</li>
    * <li>8  Device</li>
    * <li>9  Lab</li>
    * <li>10 Phenomena</li>
    * </ul>
    * In truth, any coding scheme (Numerical or otherwise) can be used as long as a {@link org.apache.ctakes.dictionary.lookup2.consumer.TermConsumer}
    * is created to use it. That being said ...
    */
   private static final String TYPE_ID = "typeId";
   /**
    * Each {@link RareWordDictionary} must have a java implementation.
    * It is best if this is a {@link RareWordDictionary},
    * but it can also be an older org apache ctakes dictionary lookup Dictionary, in which case a
    * org apache ctakes dictionary lookup2 dictionary RareWordDictionaryWrapper will be used.
    * <p>The available implementation keys are:</p>
    * <ul>
    * <li>rareWordJdbc</li>
    * <li>rareWordUmls</li>
    * <li>rareWordBsv</li>
    * <li>luceneImpl</li>
    * <li>jdbcImpl</li>
    * <li>csvImpl</li>
    * </ul>
    */
   private static final String IMPLEMENTATION = "implementation";
   /**
    * XML key specifying the section that defines the single {@link org.apache.ctakes.dictionary.lookup2.consumer.TermConsumer} that should be used to
    * consume discovered terms.
    */
   static private final String CONSUMER_KEY = "rareWordConsumer";


   // Added 'maxListSize'.  Size equals max int by default  - used for lucene dictionaries
   private static int MAX_LIST_SIZE = Integer.MAX_VALUE; //ohnlp-Bugs-3296301


   /**
    * Initiates the parsing of the XML descriptor file containing definition of dictionaries and a consumer for the
    * Rare Word Term dictionary paradigm
    *
    * @param descriptorFile XML-formatted file, see the dictionary-lookup resources file {@code RareWordTermsUMLS.xml}
    *                       for an example
    * @param uimaContext    -
    * @return {@link org.apache.ctakes.dictionary.lookup2.util.DictionarySpec} with specification of dictionaries and a consumer as read from the
    *         {@code descriptorFile}
    * @throws AnnotatorContextException if the File could not be found/read or the xml could not be parsed
    */
   static public DictionarySpec parseDescriptor( final File descriptorFile, final UimaContext uimaContext )
         throws AnnotatorContextException {
      LOGGER.info( "Parsing dictionary specifications: " + descriptorFile.getPath() );
      final SAXBuilder saxBuilder = new SAXBuilder();
      Document doc;
      try {
         doc = saxBuilder.build( descriptorFile );
      } catch ( JDOMException jdomE ) {
         throw new AnnotatorContextException( "Could not parse " + descriptorFile.getPath(), new Object[0], jdomE );
      } catch ( IOException ioE ) {
         throw new AnnotatorContextException( "Could not parse " + descriptorFile.getPath(), new Object[0], ioE );
      }
      final Map<String, RareWordDictionary> dictionaries
            = parseDictionaries( uimaContext, doc.getRootElement().getChild( DICTIONARIES_KEY ) );
      final TermConsumer consumer = parseConsumerXml( uimaContext,
                                                              doc.getRootElement().getChild( CONSUMER_KEY ) );
      return new DictionarySpec( dictionaries.values(), consumer );
   }


   /**
    * Creates dictionary engines by parsing the section defined by {@link this.DICTIONARIES_KEY}
    *
    * @param uimaContext         -
    * @param dictionariesElement contains definition of all dictionaries
    * @return Mapping of dictionary names {@link this.NAME_ID} to new {@link RareWordDictionary} instances
    * @throws AnnotatorContextException if the resource specified by {@link this.EXTERNAL_RESOURCE} does not match
    *                                   the type specified by {@link this.IMPLEMENTATION} or for some reason could not be used
    */
   static private Map<String, RareWordDictionary> parseDictionaries( final UimaContext uimaContext,
                                                                           final Element dictionariesElement )
         throws AnnotatorContextException {
      final Map<String, RareWordDictionary> engines = new HashMap<String, RareWordDictionary>();
      final Collection dictatteers = dictionariesElement.getChildren();
      for ( Object dictatteer : dictatteers ) {
         if ( dictatteer instanceof Element ) {
            final String id = ((Element) dictatteer).getAttributeValue( NAME_ID );
            final RareWordDictionary dictionary = parseDictionaryXml( uimaContext, (Element) dictatteer );
            engines.put( id, dictionary );
         }
      }
      return engines;
   }


   /**
    * Creates a dictionary by parsing each child element of {@link this.DICTIONARIES_KEY}
    *
    * @param uimaContext       -
    * @param dictionaryElement contains the definition of a single dictionary
    * @return a dictionary or null if there is a problem
    * @throws AnnotatorContextException if any of a dozen things goes wrong
    */
   private static RareWordDictionary parseDictionaryXml( final UimaContext uimaContext,
                                                         final Element dictionaryElement )
         throws AnnotatorContextException {
      final String externalResourceKey = dictionaryElement.getAttributeValue( EXTERNAL_RESOURCE );
      final Boolean keepCase = Boolean.valueOf( dictionaryElement.getAttributeValue( CASE_SENSITIVE ) );
      final String entityTypeId = dictionaryElement.getAttributeValue( TYPE_ID );
      Object externalResource;
      try {
         externalResource = uimaContext.getResourceObject( externalResourceKey );
      } catch ( ResourceAccessException raE ) {
         throw new AnnotatorContextException( "Could not access external resource " + externalResourceKey,
                                              new Object[0], raE );
      }
      if ( externalResource == null ) {
         throw new AnnotatorContextException( "Could not find external resource " + externalResourceKey,
                                              new Object[0] );
      }
      RareWordDictionary dictionary = null;
      final Element implementationElement = (Element) dictionaryElement.getChild( IMPLEMENTATION ).getChildren().get( 0 );
      final String implementationName = implementationElement.getName();
      if ( implementationName.equals( "rareWordJdbc" ) ) {
         dictionary = DictionaryFactory.createRareWordJdbc( implementationElement,
                                                            externalResource,
                                                            entityTypeId );
      } else if ( implementationName.equals( "rareWordUmls" ) ) {
         try {
            UmlsUserApprover.validateUMLSUser( uimaContext );
            dictionary = DictionaryFactory.createRareWordJdbc( implementationElement,
                                                               externalResource,
                                                               entityTypeId );
         } catch ( ResourceInitializationException riE ) {
            throw new AnnotatorContextException( riE );
         }
      } else if ( implementationName.equals( "rareWordBsv" ) ) {
         dictionary = DictionaryFactory.createRareWordBsv( externalResourceKey, externalResource, entityTypeId );
//      } else if ( implementationName.equals( "luceneImpl" ) ) {
//         dictionary = DictionaryFactory.createWrappedLucene( dictionaryElement,
//                                                                     externalResourceKey,
//                                                                     externalResource,
//                                                                     entityTypeId );
//      } else if ( implementationName.equals( "jdbcImpl" ) ) {
//         dictionary = DictionaryFactory.createWrappedJdbc( dictionaryElement,
//                                                                   implementationElement,
//                                                                   externalResourceKey,
//                                                                   externalResource,
//                                                                   entityTypeId );
//      } else if ( implementationName.equals( "csvImp" ) ) {
//         dictionary = DictionaryFactory.createWrappedCsv( dictionaryElement,
//                                                                  implementationElement,
//                                                                  externalResourceKey,
//                                                                  externalResource,
//                                                                  entityTypeId );
      } else {
         throw new AnnotatorContextException( "Unsupported dictionary implementation " + implementationName,
                                              new Object[0] );
      }
      if ( dictionary == null ) {
         throw new AnnotatorContextException( "No appropriate dictionary defined", new Object[0] );
      }
      // Deprecated -
//      if ( dictionary instanceof Dictionary ) {
//         final Collection metaFields = dictionaryElement.getChild( "metaFields" ).getChildren();
//         for ( Object value : metaFields ) {
//            String metaFieldName = ((Element) value).getAttributeValue( "fieldName" );
//            ((Dictionary) dictionary).retainMetaData( metaFieldName );
//         }
//      }
      return dictionary;
   }






   /**
    * Creates a term consumer by parsing section defined by {@link this.CONSUMER_KEY}
    *
    * @param uimaContext           -
    * @param lookupConsumerElement contains the definition of the term consumer
    * @return a term consumer
    * @throws AnnotatorContextException if any of a dozen things goes wrong
    */
   private static TermConsumer parseConsumerXml( final UimaContext uimaContext,
                                                         final Element lookupConsumerElement ) throws
                                                                                               AnnotatorContextException {
      Class[] constrArgsConsum = {UimaContext.class, Properties.class, int.class};//ohnlp-Bugs-3296301
      Class[] constrArgsConsumB = {UimaContext.class, Properties.class};


      String consumerClassName = lookupConsumerElement.getAttributeValue( "className" );
      Element consumerPropertiesElement = lookupConsumerElement.getChild( "properties" );
      Properties consumerProperties = parsePropertiesXml( consumerPropertiesElement );
      Class consumerClass;
      try {
         consumerClass = Class.forName( consumerClassName );
      } catch ( ClassNotFoundException cnfE ) {
         throw new AnnotatorContextException( "Unknown class " + consumerClassName, new Object[0], cnfE );
      }
      if ( !TermConsumer.class.isAssignableFrom( consumerClass ) ) {
         throw new AnnotatorContextException( consumerClassName + " is not a TermConsumer",
                                              new Object[0] );
      }
      final Constructor[] constructors = consumerClass.getConstructors();
      for ( Constructor constructor : constructors ) {
         try {
            if ( Arrays.equals( constrArgsConsum, constructor.getParameterTypes() ) ) {
               final Object[] args = new Object[]{uimaContext, consumerProperties, MAX_LIST_SIZE}; //ohnlp-Bugs-3296301
               return (TermConsumer) constructor.newInstance( args );
            } else if ( Arrays.equals( constrArgsConsumB, constructor.getParameterTypes() ) ) {
               final Object[] args = new Object[]{uimaContext, consumerProperties};
               return (TermConsumer) constructor.newInstance( args );
            }
         } catch ( InstantiationException inE ) {
            throw new AnnotatorContextException( "Could not construct " + consumerClassName, new Object[0], inE );
         } catch ( IllegalAccessException iaE ) {
            throw new AnnotatorContextException( "Could not construct " + consumerClassName, new Object[0], iaE );
         } catch ( InvocationTargetException itE ) {
            throw new AnnotatorContextException( "Could not construct " + consumerClassName, new Object[0], itE );
         }
      }
      throw new AnnotatorContextException( "No Constructor for " + consumerClassName, new Object[0] );
   }


   /**
    * Builds a collection of key, value properties
    *
    * @param propertiesElement element with key, value pairs
    * @return Properties
    */
   private static Properties parsePropertiesXml( final Element propertiesElement ) {
      final Properties properties = new Properties();
      final Collection propertyElements = propertiesElement.getChildren();
      for ( Object value : propertyElements ) {
         final Element propertyElement = (Element) value;
         final String key = propertyElement.getAttributeValue( "key" );
         final String propertyValue = propertyElement.getAttributeValue( "value" );
         properties.put( key, propertyValue );
      }
      return properties;
   }


}
// Source Code of org.apache.ctakes.dictionary.lookup2.dictionary.DictionaryDescriptorParser

// Related Classes of org.apache.ctakes.dictionary.lookup2.dictionary.DictionaryDescriptorParser