Source Code of org.apache.ctakes.dictionary.lookup.ae.FirstTokenPermLookupInitializerImpl

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.dictionary.lookup.ae;


import org.apache.ctakes.core.util.JCasUtil;
import org.apache.ctakes.dictionary.lookup.DictionaryEngine;
import org.apache.ctakes.dictionary.lookup.algorithms.FirstTokenPermutationImpl;
import org.apache.ctakes.dictionary.lookup.algorithms.LookupAlgorithm;
import org.apache.ctakes.dictionary.lookup.phrasebuilder.PhraseBuilder;
import org.apache.ctakes.dictionary.lookup.phrasebuilder.VariantPhraseBuilderImpl;
import org.apache.ctakes.dictionary.lookup.vo.LookupAnnotation;
import org.apache.ctakes.dictionary.lookup.vo.LookupToken;
import org.apache.ctakes.typesystem.type.syntax.*;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.jcas.tcas.Annotation;


import java.util.*;


/**
 * @author Mayo Clinic
 */
public class FirstTokenPermLookupInitializerImpl implements LookupInitializer {


   static private final String TRUE_STRING = Boolean.toString( true );
   static private final String FALSE_STRING = Boolean.toString( false );


   // LOG4J logger based on class name
   final private Logger iv_logger = Logger.getLogger( getClass().getName() );


   // properties for firstWordPermutation algorithm
   static private final String TEXT_MFS_PRP_KEY = "textMetaFields";
   static private final String MAX_P_LEVEL_PRP_KEY = "maxPermutationLevel";
   static private final String WINDOW_ANNOT_PRP_KEY = "windowAnnotations";
   static private final String EXC_TAGS_PRP_KEY = "exclusionTags"; // optional


   static private final String CANONICAL_VARIANT_ATTR = "canonicalATTR";


   final private Properties iv_props;


   // array of JCas window annotation type values
   final private int[] iv_annotTypeArr;


   // set of exclusion POS tags (lower cased)
   final private Set<String> iv_exclusionTagSet;


   public FirstTokenPermLookupInitializerImpl( final UimaContext uimaContext,
                                               final Properties props ) {
      // TODO property validation could be done here
      iv_props = props;


      // optional context window annotations
      final String windowAnnots = iv_props.getProperty( WINDOW_ANNOT_PRP_KEY );
      if ( windowAnnots != null ) {
         String[] windowAnnotArr = windowAnnots.split( "\\|" );
         iv_annotTypeArr = new int[windowAnnotArr.length];
         for ( int i = 0; i < windowAnnotArr.length; i++ ) {
            iv_annotTypeArr[i] = JCasUtil.getType( windowAnnotArr[i] );
         }
      } else {
         iv_annotTypeArr = null;
      }


      // optional exclusion POS tags
      final String tagStr = iv_props.getProperty( EXC_TAGS_PRP_KEY );
      if ( tagStr != null ) {
         iv_exclusionTagSet = new HashSet<>();
         final String[] tagArr = tagStr.split( "," );
         for ( String tag : tagArr ) {
            iv_exclusionTagSet.add( tag.toLowerCase() );
         }
         iv_logger.info( "Exclusion tagset loaded: " + iv_exclusionTagSet );
      } else {
         iv_exclusionTagSet = null;
      }
   }


   /**
    * {@inheritDoc}
    */
   @Override
   public LookupAlgorithm getLookupAlgorithm( final DictionaryEngine dictEngine )
         throws AnnotatorInitializationException {
      final String textMetaFields = iv_props.getProperty( TEXT_MFS_PRP_KEY );
      String[] textMetaFieldNameArr;
      if ( textMetaFields == null ) {
         textMetaFieldNameArr = new String[0];
      } else {
         textMetaFieldNameArr = textMetaFields.split( "\\|" );
      }
      // variant support
      final String[] variantArr = {CANONICAL_VARIANT_ATTR};
      final PhraseBuilder pb = new VariantPhraseBuilderImpl( variantArr, true );
      final int maxPermutationLevel = Integer.parseInt( iv_props.getProperty( MAX_P_LEVEL_PRP_KEY ) );
      return new FirstTokenPermutationImpl( dictEngine, pb, textMetaFieldNameArr, maxPermutationLevel );
   }


   private boolean isTagExcluded( final String tag ) {
      return iv_exclusionTagSet != null && tag != null && iv_exclusionTagSet.contains( tag.toLowerCase() );
   }


   /**
    * {@inheritDoc}
    */
   @Override
   public Iterator<LookupToken> getLookupTokenIterator( final JCas jcas ) throws AnnotatorInitializationException {
      final List<LookupToken> ltList = new ArrayList<>();


      final JFSIndexRepository indexes = jcas.getJFSIndexRepository();
      final AnnotationIndex<Annotation> annotationIndex = indexes.getAnnotationIndex( BaseToken.type );
      for ( Annotation annotation : annotationIndex ) {
         if ( !(annotation instanceof BaseToken) ) {
            iv_logger.warn( getClass().getName() + " getLookupTokenIterator(..) Annotation is not a BaseToken" );
            continue;
         }
         final boolean isNonLookup = annotation instanceof NewlineToken
               || annotation instanceof PunctuationToken
               || annotation instanceof ContractionToken
               || annotation instanceof SymbolToken;
         if ( isNonLookup ) {
            continue;
         }
         final BaseToken bta = (BaseToken) annotation;
         final LookupToken lt = new LookupAnnotationToJCasAdapter( bta );
         // POS exclusion logic for first word lookup
         if ( isTagExcluded( bta.getPartOfSpeech() ) ) {
            lt.addStringAttribute( FirstTokenPermutationImpl.LT_KEY_USE_FOR_LOOKUP, FALSE_STRING );
         } else {
            lt.addStringAttribute( FirstTokenPermutationImpl.LT_KEY_USE_FOR_LOOKUP, TRUE_STRING );
         }
         if ( bta instanceof WordToken ) {
            final WordToken wta = (WordToken) bta;
            final String canonicalForm = wta.getCanonicalForm();
            if ( canonicalForm != null ) {
               lt.addStringAttribute( CANONICAL_VARIANT_ATTR, canonicalForm );
            }
         }
         ltList.add( lt );
      }
      return ltList.iterator();
   }


   /**
    * {@inheritDoc}
    */
   @Override
   public Iterator<Annotation> getLookupWindowIterator( final JCas jcas ) throws AnnotatorInitializationException {
      try {
         final JFSIndexRepository indexes = jcas.getJFSIndexRepository();
         final String objClassName = iv_props.getProperty( WINDOW_ANNOT_PRP_KEY );
         int windowType;
         try {
            windowType = JCasUtil.getType( objClassName );
         } catch ( IllegalArgumentException iaE ) {
            // thrown by JCasUtil.getType()
            throw new AnnotatorInitializationException( iaE );
         }
         return indexes.getAnnotationIndex( windowType ).iterator();
      } catch ( Exception e ) {
         // TODO specify exceptions, get rid of the catch for "Exception"
         throw new AnnotatorInitializationException( e );
      }
   }


   /**
    * {@inheritDoc}
    */
   @Override
   public Map<String, List<LookupAnnotation>> getContextMap( final JCas jcas,
                                                             final int windowBegin, final int windowEnd )
         throws AnnotatorInitializationException {
      if ( iv_annotTypeArr == null ) {
         return Collections.emptyMap();
      }
      final List<LookupAnnotation> list = new ArrayList<>();
      // algorithm depends on a window for permutations
      final JFSIndexRepository indexes = jcas.getJFSIndexRepository();
      for ( int annotationType : iv_annotTypeArr ) {
         final Iterator<Annotation> itr = indexes.getAnnotationIndex( annotationType ).iterator();
         list.addAll( constrainToWindow( windowBegin, windowEnd, itr ) );
      }
      final Map<String, List<LookupAnnotation>> m = new HashMap<>( 1 );
      m.put( FirstTokenPermutationImpl.CTX_KEY_WINDOW_ANNOTATIONS, list );
      return m;
   }


   /**
    * Gets a list of LookupAnnotation objects within the specified window.
    *
    * @param annotItr -
    * @return list of lookup annotations
    */
   private static List<LookupAnnotation> constrainToWindow( final int begin, final int end,
                                                     final Iterator<Annotation> annotItr ) {
      final List<LookupAnnotation> list = new ArrayList<>();
      while ( annotItr.hasNext() ) {
         final Annotation annot = annotItr.next();
         // only consider if it's within the window
         if ( (annot.getBegin() >= begin) && (annot.getEnd() <= end) ) {
            list.add( new LookupAnnotationToJCasAdapter( annot ) );
         }
      }
      return list;
   }


   /**
    * {@inheritDoc}
    */
   @Override
   public List<LookupToken> getSortedLookupTokens( final JCas jcas,
                                                   final Annotation covering ) throws AnnotatorInitializationException {
      final List<LookupToken> ltList = new ArrayList<>();
      final List<BaseToken> inputList = org.uimafit.util.JCasUtil.selectCovered( jcas, BaseToken.class, covering );
      for ( BaseToken bta : inputList ) {
         final boolean isNonLookup = bta instanceof NewlineToken
               || bta instanceof PunctuationToken
               || bta instanceof ContractionToken
               || bta instanceof SymbolToken;
         if ( isNonLookup ) {
            continue;
         }
         final LookupToken lt = new LookupAnnotationToJCasAdapter( bta );
         // POS exclusion logic for first word lookup
         if ( isTagExcluded( bta.getPartOfSpeech() ) ) {
            lt.addStringAttribute( FirstTokenPermutationImpl.LT_KEY_USE_FOR_LOOKUP, FALSE_STRING );
         } else {
            lt.addStringAttribute( FirstTokenPermutationImpl.LT_KEY_USE_FOR_LOOKUP, TRUE_STRING );
         }
         if ( bta instanceof WordToken ) {
            final WordToken wta = (WordToken) bta;
            final String canonicalForm = wta.getCanonicalForm();
            if ( canonicalForm != null ) {
               lt.addStringAttribute( CANONICAL_VARIANT_ATTR, canonicalForm );
            }
         }
         ltList.add( lt );
      }
      return ltList;
   }


}
Source Code of org.apache.ctakes.dictionary.lookup.ae.FirstTokenPermLookupInitializerImpl

Related Classes of org.apache.ctakes.dictionary.lookup.ae.FirstTokenPermLookupInitializerImpl