Package org.apache.uima.annotator.dict_annot.dictionary

Source Code of org.apache.uima.annotator.dict_annot.dictionary.DictionaryMatcherTest

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.annotator.dict_annot.dictionary;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import junit.framework.Assert;
import junit.framework.TestCase;

import org.apache.uima.UIMAFramework;
import org.apache.uima.annotator.dict_annot.dictionary.impl.DictionaryFileParserImpl;
import org.apache.uima.annotator.dict_annot.dictionary.impl.FeaturePathInfo;
import org.apache.uima.annotator.dict_annot.dictionary.impl.HashMapDictionaryBuilder;
import org.apache.uima.annotator.dict_annot.impl.FeaturePathInfo_impl;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.impl.XCASDeserializer;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.metadata.FsIndexDescription;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.test.junit_extension.JUnitExtension;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.XMLInputSource;

/**
* Tests if the dictionary matches works correctly.
*/
public class DictionaryMatcherTest extends TestCase {

   /**
    * Test matcher that takes the text and the dictionary and adds all matches
    * to the array list.
    *
    * @param dict
    *           dictionary to use
    * @param tokens
    *           tokenized string
    * @param matches
    *           match list
    */
   public void match(Dictionary dict, AnnotationFS[] annotFSs, FeaturePathInfo featPathInfo,
         ArrayList<String> matches) {
      int currentPos = 0;
      while (currentPos < annotFSs.length) {

         DictionaryMatch dictMatch = dict.matchEntry(currentPos, annotFSs, featPathInfo);
         if (dictMatch != null) {
            // we have found a match starting at currentPos
            int matchLength = dictMatch.getMatchLength();
            StringBuffer buffer = new StringBuffer();
            for (int i = 0; i < matchLength; i++) {
               buffer.append(annotFSs[currentPos + i].getCoveredText());
               buffer.append(" ");
            }
            matches.add(buffer.toString().trim());
            // adjust current token position in case of multi word match
            currentPos = currentPos + matchLength;
         } else {
            // no match found, go to the next token
            currentPos++;
         }
      }
   }

   /**
    * tests the dictionary matching for single words and multi words.
    *
    * @throws Exception
    */
   public void testDictionaryMatchingOutsideAnnotator() throws Exception {

      // create the dictionary
      File dictFile = JUnitExtension
            .getFile("DictionaryMatchTests/MultiWords.xml");
      InputStream stream = new BufferedInputStream(
            new FileInputStream(dictFile));

      DictionaryBuilder dictBuilder = new HashMapDictionaryBuilder();
      // create dictionary file parser
      DictionaryFileParser fileParser = new DictionaryFileParserImpl();
      fileParser.parseDictionaryFile(dictFile.getAbsolutePath(), stream,
            dictBuilder);

      Dictionary dict = dictBuilder.getDictionary();

      // -- read input XCAS and create a CAS --

      // read type system file
      File typeSystemFile = JUnitExtension
            .getFile("DictionaryMatchTests/Token.xml");
      // get XCAS file
      File xcasFile = JUnitExtension.getFile("DictionaryMatchTests/Token.xcas");

      // parse type system file
      Object descriptor = UIMAFramework.getXMLParser().parse(
            new XMLInputSource(typeSystemFile));
      TypeSystemDescription tsDesc = (TypeSystemDescription) descriptor;

      // create a CAS and add XCAS content
      CAS cas = CasCreationUtils.createCas(tsDesc, null,
            new FsIndexDescription[0]);
      SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
      XCASDeserializer xcasDeserializer = new XCASDeserializer(cas
            .getTypeSystem());
      parser.parse(xcasFile, xcasDeserializer.getXCASHandler(cas));

      // get dictionary match input type
      Type inputType = cas.getTypeSystem().getType(
            "org.apache.uima.TokenAnnotation");
      Assert
            .assertNotNull("Type org.apache.uima.TokenAnnotation not found in the type system"
                  + inputType);

      // copy input match type annotations to an array
      FSIterator it = cas.getAnnotationIndex(inputType).iterator();
      ArrayList<AnnotationFS> inputTypeAnnots = new ArrayList<AnnotationFS>();
      while (it.hasNext()) {
         inputTypeAnnots.add((AnnotationFS) it.next());
      }
      AnnotationFS[] annotFSs = inputTypeAnnots.toArray(new AnnotationFS[] {});

      // check matches for the CAS
      ArrayList<String> matches = new ArrayList<String>();
      match(dict, annotFSs, new FeaturePathInfo_impl(), matches);

      // check match results
      Assert.assertEquals("new", matches.get(0));
      Assert.assertEquals("new york", matches.get(1));
      Assert.assertEquals("new orleans", matches.get(2));
      Assert.assertEquals("new york city", matches.get(3));
   }
}
TOP

Related Classes of org.apache.uima.annotator.dict_annot.dictionary.DictionaryMatcherTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.