Source Code of org.apache.uima.annotator.dict_annot.dictionary.DictionaryMatcherTest

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.uima.annotator.dict_annot.dictionary;


import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;


import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;


import junit.framework.Assert;
import junit.framework.TestCase;


import org.apache.uima.UIMAFramework;
import org.apache.uima.annotator.dict_annot.dictionary.impl.DictionaryFileParserImpl;
import org.apache.uima.annotator.dict_annot.dictionary.impl.FeaturePathInfo;
import org.apache.uima.annotator.dict_annot.dictionary.impl.HashMapDictionaryBuilder;
import org.apache.uima.annotator.dict_annot.impl.FeaturePathInfo_impl;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.impl.XCASDeserializer;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.metadata.FsIndexDescription;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.test.junit_extension.JUnitExtension;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.XMLInputSource;


/**
 * Tests if the dictionary matches works correctly.
 */
public class DictionaryMatcherTest extends TestCase {


   /**
    * Test matcher that takes the text and the dictionary and adds all matches
    * to the array list.
    * 
    * @param dict
    *           dictionary to use
    * @param tokens
    *           tokenized string
    * @param matches
    *           match list
    */
   public void match(Dictionary dict, AnnotationFS[] annotFSs, FeaturePathInfo featPathInfo,
         ArrayList<String> matches) {
      int currentPos = 0;
      while (currentPos < annotFSs.length) {


         DictionaryMatch dictMatch = dict.matchEntry(currentPos, annotFSs, featPathInfo);
         if (dictMatch != null) {
            // we have found a match starting at currentPos
            int matchLength = dictMatch.getMatchLength();
            StringBuffer buffer = new StringBuffer();
            for (int i = 0; i < matchLength; i++) {
               buffer.append(annotFSs[currentPos + i].getCoveredText());
               buffer.append(" ");
            }
            matches.add(buffer.toString().trim());
            // adjust current token position in case of multi word match
            currentPos = currentPos + matchLength;
         } else {
            // no match found, go to the next token
            currentPos++;
         }
      }
   }


   /**
    * tests the dictionary matching for single words and multi words.
    * 
    * @throws Exception
    */
   public void testDictionaryMatchingOutsideAnnotator() throws Exception {


      // create the dictionary
      File dictFile = JUnitExtension
            .getFile("DictionaryMatchTests/MultiWords.xml");
      InputStream stream = new BufferedInputStream(
            new FileInputStream(dictFile));


      DictionaryBuilder dictBuilder = new HashMapDictionaryBuilder();
      // create dictionary file parser
      DictionaryFileParser fileParser = new DictionaryFileParserImpl();
      fileParser.parseDictionaryFile(dictFile.getAbsolutePath(), stream,
            dictBuilder);


      Dictionary dict = dictBuilder.getDictionary();


      // -- read input XCAS and create a CAS --


      // read type system file
      File typeSystemFile = JUnitExtension
            .getFile("DictionaryMatchTests/Token.xml");
      // get XCAS file
      File xcasFile = JUnitExtension.getFile("DictionaryMatchTests/Token.xcas");


      // parse type system file
      Object descriptor = UIMAFramework.getXMLParser().parse(
            new XMLInputSource(typeSystemFile));
      TypeSystemDescription tsDesc = (TypeSystemDescription) descriptor;


      // create a CAS and add XCAS content
      CAS cas = CasCreationUtils.createCas(tsDesc, null,
            new FsIndexDescription[0]);
      SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
      XCASDeserializer xcasDeserializer = new XCASDeserializer(cas
            .getTypeSystem());
      parser.parse(xcasFile, xcasDeserializer.getXCASHandler(cas));


      // get dictionary match input type
      Type inputType = cas.getTypeSystem().getType(
            "org.apache.uima.TokenAnnotation");
      Assert
            .assertNotNull("Type org.apache.uima.TokenAnnotation not found in the type system"
                  + inputType);


      // copy input match type annotations to an array
      FSIterator it = cas.getAnnotationIndex(inputType).iterator();
      ArrayList<AnnotationFS> inputTypeAnnots = new ArrayList<AnnotationFS>();
      while (it.hasNext()) {
         inputTypeAnnots.add((AnnotationFS) it.next());
      }
      AnnotationFS[] annotFSs = inputTypeAnnots.toArray(new AnnotationFS[] {});


      // check matches for the CAS
      ArrayList<String> matches = new ArrayList<String>();
      match(dict, annotFSs, new FeaturePathInfo_impl(), matches);


      // check match results
      Assert.assertEquals("new", matches.get(0));
      Assert.assertEquals("new york", matches.get(1));
      Assert.assertEquals("new orleans", matches.get(2));
      Assert.assertEquals("new york city", matches.get(3));
   }
}
Source Code of org.apache.uima.annotator.dict_annot.dictionary.DictionaryMatcherTest

Related Classes of org.apache.uima.annotator.dict_annot.dictionary.DictionaryMatcherTest