Source Code of org.apache.uima.examples.tagger.test.TaggerTest

/*
 *Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * 
 */
package org.apache.uima.examples.tagger.test;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;


import junit.framework.TestCase;


import org.apache.uima.TokenAnnotation;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.examples.tagger.HMMTagger;
import org.apache.uima.examples.tagger.Viterbi;
import org.apache.uima.jcas.JCas;
import org.apache.uima.test.junit_extension.JUnitExtension;
import org.apache.uima.util.FileUtils;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XMLParser;


// This test was run with JUnit 3.


public class TaggerTest extends TestCase {


  private HMMTagger hmm;


  private List<String> sent; // sentence


  private List<String> gold_standard;


  private List<String> tagger_output;


  /**
   * Set up the test fixture
   */


  protected void setUp() {


    hmm = new HMMTagger();
    gold_standard = new ArrayList<String>();
    sent = new ArrayList<String>();
    tagger_output = new ArrayList<String>();


  }


  /**
   * Tests tagging for German.
   * 
   */
  @SuppressWarnings("unchecked")
  public void testGermanTagger() {


    System.out.println("Testing German Model... ");
    List POS = new ArrayList();


    try {
      hmm.my_model = HMMTagger.get_model("resources/german/TuebaModel.dat");
    } catch (Exception e) {
      System.out.println("Model which is supposed to be used for testing does not exist");
    }
    System.out.println(hmm.my_model.word_probs.size() + " distinct words in the model");


    Iterator<Entry<String, Map<String, Double>>> keyValuePairs = hmm.my_model.word_probs.entrySet()
        .iterator(); // iterate over words


    for (int i = 0; i < hmm.my_model.word_probs.size(); i++) {
      Map.Entry<String, Map<String, Double>> entry = (Map.Entry<String, Map<String, Double>>) keyValuePairs
          .next();
      Object key = entry.getKey();
      Map<String, Double> pos = (Map) hmm.my_model.word_probs.get(key); // map of possible pos-s of
      // the word
      Object[] pos_s = pos.entrySet().toArray(); // for iteration over possible pos_s


      for (int u = 0; u < pos_s.length; u++) {


        Map.Entry<String, Map<String, Double>> entry2 = (Map.Entry<String, Map<String, Double>>) pos_s[u];
        Object key2 = entry2.getKey(); // pos of a word
        if (POS.contains(key2)) {
          continue;


        } else {
          POS.add(key2);
        }
      }


    }
    Collections.sort(POS);
    System.out.println("Number of part-of-speech tags used: " + POS.size());
    System.out.println("These are:  " + POS);


    System.out.println("Testing German trigram tagger..");


    sent.add("Jerry");
    sent.add("liebt");
    sent.add("Wansley");
    sent.add(".");


    System.out.println(sent);


    hmm.N = 3;
    // hmm.END_OF_SENT_TAG = "$.";


    String[] out = new String[] { "NE", "VVFIN", "NE", "$." };
    gold_standard.addAll(Arrays.asList(out));
    tagger_output = Viterbi.process(hmm.N, sent, hmm.my_model.suffix_tree,
        hmm.my_model.suffix_tree_capitalized, hmm.my_model.transition_probs,
        hmm.my_model.word_probs, hmm.my_model.lambdas2, hmm.my_model.lambdas3, hmm.my_model.theta);
    System.out.println("expected: " + gold_standard);
    System.out.println("tagger output: " + tagger_output);
  }


  /**
   * Tests English trigram tagger
   * 
   */
  @SuppressWarnings("unchecked")
  public void testEnglishTagger() {


    System.out.println("Tesing English Model... ");
    List POS = new ArrayList();


    try {
      hmm.my_model = HMMTagger.get_model("resources/english/BrownModel.dat");
    } catch (Exception e) {
      System.out.println("Model which is supposed to be used for testing does not exist");
    }
    System.out.println(hmm.my_model.word_probs.size() + " distinct words in the model");


    Iterator<Entry<String, Map<String, Double>>> keyValuePairs = hmm.my_model.word_probs.entrySet()
        .iterator(); // iterate over words


    for (int i = 0; i < hmm.my_model.word_probs.size(); i++) {
      Map.Entry<String, Map<String, Double>> entry = (Map.Entry<String, Map<String, Double>>) keyValuePairs
          .next();
      Object key = entry.getKey();
      Map<String, Double> pos = (Map) hmm.my_model.word_probs.get(key); // map of possible pos-s of
      // the word
      Object[] pos_s = pos.entrySet().toArray(); // for iteration over possible pos_s


      for (int u = 0; u < pos_s.length; u++) {


        Map.Entry<String, Map<String, Double>> entry2 = (Map.Entry<String, Map<String, Double>>) pos_s[u];
        Object key2 = entry2.getKey(); // pos of a word
        if (POS.contains(key2)) {
          continue;


        } else {
          POS.add(key2);
        }
      }


    }
    Collections.sort(POS);
    System.out.println("Number of part-of-speech tags used: " + POS.size());
    System.out.println("These are:  " + POS);


    System.out.println("Testing English trigram tagger...");


    sent.add("Jerry");
    sent.add("loves");
    sent.add("Wansley");
    sent.add(".");


    System.out.println(sent);


    hmm.N = 3;
    // hmm.END_OF_SENT_TAG = "$.";


    String[] out = new String[] { "np", "vbz", "np", "." };
    gold_standard.addAll(Arrays.asList(out));
    tagger_output = Viterbi.process(hmm.N, sent, hmm.my_model.suffix_tree,
        hmm.my_model.suffix_tree_capitalized, hmm.my_model.transition_probs,
        hmm.my_model.word_probs, hmm.my_model.lambdas2, hmm.my_model.lambdas3, hmm.my_model.theta);
    System.out.println("expected: " + gold_standard);
    System.out.println("tagger output: " + tagger_output);
  }


  /**
   * Run tagger on Moby Dick and compare result to pre-computed XCAS. This test case is a bit
   * brittle. First of all, it requires the uimaj-core project as a neighbor, otherwise it won't
   * find moby.txt, the test input.  Secondly, the reference output is only a list of POS tags.
   * This is so the file doesn't get so big.  Finally, if anything changes, even improvements, this
   * test will most likely fail and will need to be updated.  There's an unused method below that
   * can be used to write a new set of POS tags when the old ones become obsolete.
   */
  public void testMobyDick() {
    try {
      XMLParser xmlParser = UIMAFramework.getXMLParser();
      XMLInputSource xmlInputSource = new XMLInputSource("desc/HmmTaggerAggregate.xml");
      AnalysisEngine taggerEngine = UIMAFramework.produceAnalysisEngine(xmlParser
          .parseResourceSpecifier(xmlInputSource));
      String text = null;
      try {
        text = FileUtils.file2String(new File("../uimaj-core/src/test/resources/data/moby.txt"),
            "utf-8");
      } catch (FileNotFoundException e) {
        System.err
            .println("Warning: this test case runs only if uimaj-core is a sister project to the tagger project; otherwise, the test input data is not available.");
        System.err.println("Skipping test");
        return;
      }
      JCas cas = taggerEngine.newJCas();
      cas.setDocumentText(text);
      taggerEngine.process(cas);
      List<String> savedTags = readSavedTagList();
      List<String> currentTags = getCurrentTagList(cas);
      assertTrue("List of tags is not the same length", savedTags.size() == currentTags.size());
      for (int i = 0; i < savedTags.size(); i++) {
        assertEquals("Different tags at position " + i, savedTags.get(i), currentTags.get(i));
      }
    } catch (Exception e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }


  private List<String> readSavedTagList() throws IOException {
    List<String> tags = new ArrayList<String>();
    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(
        "src/test/resources/moby-tag-list.txt"), "utf-8"));
    String tag = null;
    while ((tag = reader.readLine()) != null) {
      tags.add(tag);
    }
    return tags;
  }


  private List<String> getCurrentTagList(JCas cas) {
    List<String> tagList = new ArrayList<String>();
    AnnotationIndex tokenIndex = cas.getAnnotationIndex(TokenAnnotation.type);
    FSIterator tokIt = tokenIndex.iterator();
    TokenAnnotation token = null;
    for (tokIt.moveToFirst(); tokIt.isValid(); tokIt.moveToNext()) {
      token = (TokenAnnotation) tokIt.get();
      tagList.add(token.getPosTag());
    }
    return tagList;
  }


  /**
   * @param cas
   * @throws IOException
   * @throws UnsupportedEncodingException
   */
  private void printPosTags(JCas cas) throws UnsupportedEncodingException, IOException {
    Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
        "moby-tag-list.txt"), "utf-8"));
    AnnotationIndex tokenIndex = cas.getAnnotationIndex(TokenAnnotation.type);
    FSIterator tokIt = tokenIndex.iterator();
    TokenAnnotation token = null;
    for (tokIt.moveToFirst(); tokIt.isValid(); tokIt.moveToNext()) {
      token = (TokenAnnotation) tokIt.get();
      writer.write(token.getPosTag());
      writer.write('\n');
    }
    writer.close();
  }
}
Source Code of org.apache.uima.examples.tagger.test.TaggerTest

Related Classes of org.apache.uima.examples.tagger.test.TaggerTest