// Source Code of org.apache.uima.cas.test.SerializationNoMDTest

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


package org.apache.uima.cas.test;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;


import junit.framework.TestCase;


import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIndexRepository;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.admin.CASFactory;
import org.apache.uima.cas.admin.CASMgr;
import org.apache.uima.cas.admin.TypeSystemMgr;
import org.apache.uima.cas.impl.CASImpl;
import org.apache.uima.cas.impl.CASSerializer;
import org.apache.uima.cas.impl.Serialization;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.internal.util.TextStringTokenizer;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.test.junit_extension.JUnitExtension;
import org.apache.uima.util.CasCreationUtils;


/**
 * Class comment for TokenizerTest.java goes here.
 * 
 */
public class SerializationNoMDTest extends TestCase {


  public static final String TOKEN_TYPE = "Token";


  public static final String TOKEN_TYPE_FEAT = "type";


  public static final String TOKEN_TYPE_FEAT_Q = TOKEN_TYPE + TypeSystem.FEATURE_SEPARATOR
          + TOKEN_TYPE_FEAT;


  public static final String TOKEN_TYPE_TYPE = "TokenType";


  public static final String WORD_TYPE = "Word";


  public static final String SEP_TYPE = "Separator";


  public static final String EOS_TYPE = "EndOfSentence";


  public static final String SENT_TYPE = "Sentence";


  private CASMgr casMgr;


  private CAS cas;


  private Type wordType;


  private Type separatorType;


  private Type eosType;


  private Type tokenType;


  private Feature tokenTypeFeature;


  private Type sentenceType;


  private Feature startFeature;


  private Feature endFeature;


  public SerializationNoMDTest(String arg) {
    super(arg);
  }


  /**
   * @see junit.framework.TestCase#setUp()
   */
  public void setUp() throws Exception {
    super.setUp();
    casMgr = initCAS();
    cas = (CASImpl)casMgr;


    TypeSystem ts = cas.getTypeSystem();
    wordType = ts.getType(WORD_TYPE);
    // assert(wordType != null);
    separatorType = ts.getType(SEP_TYPE);
    eosType = ts.getType(EOS_TYPE);
    tokenType = ts.getType(TOKEN_TYPE);
    tokenTypeFeature = ts.getFeatureByFullName(TOKEN_TYPE_FEAT_Q);
    startFeature = ts.getFeatureByFullName(CAS.FEATURE_FULL_NAME_BEGIN);
    endFeature = ts.getFeatureByFullName(CAS.FEATURE_FULL_NAME_END);
    sentenceType = ts.getType(SENT_TYPE);
  }


  public void tearDown() {
    casMgr = null;
    cas = null;
    wordType = null;
    separatorType = null;
    eosType = null;
    tokenType = null;
    tokenTypeFeature = null;
    startFeature = null;
    endFeature = null;
    sentenceType = null;


  }


  // Initialize the first CAS.
  private static CASMgr initCAS() throws CASException {
    // Create an initial CASMgr from the factory.
    // CASMgr cas = CASFactory.createCAS();
    // assert(tsa != null);
    // Create a CASMgr. Ensures existence of AnnotationFS type.
    // CASMgr tcas = CASFactory.createCAS();
    CASMgr aCas = CASFactory.createCAS();
    try {
      CasCreationUtils.setupTypeSystem(aCas, (TypeSystemDescription) null);
    } catch (ResourceInitializationException e) {
      e.printStackTrace();
    }
    // Create a writable type system.
    TypeSystemMgr tsa = aCas.getTypeSystemMgr();
    // Add new types and features.
    Type topType = tsa.getTopType();
    Type annotType = tsa.getType(CAS.TYPE_NAME_ANNOTATION);
    // assert(annotType != null);
    tsa.addType(SENT_TYPE, annotType);
    Type tokenType = tsa.addType(TOKEN_TYPE, annotType);
    Type tokenTypeType = tsa.addType(TOKEN_TYPE_TYPE, topType);
    tsa.addType(WORD_TYPE, tokenTypeType);
    tsa.addType(SEP_TYPE, tokenTypeType);
    tsa.addType(EOS_TYPE, tokenTypeType);
    tsa.addFeature(TOKEN_TYPE_FEAT, tokenType, tokenTypeType);
    // Commit the type system.
    ((CASImpl) aCas).commitTypeSystem();
    // assert(tsa.isCommitted());
    // // Create the CAS indexes.
    // tcas.initCASIndexes();
    // Create the Base indexes.
    try {
      aCas.initCASIndexes();
    } catch (CASException e) {
      e.printStackTrace();
    }


    // Commit the index repository.
    aCas.getIndexRepositoryMgr().commit();
    // assert(cas.getIndexRepositoryMgr().isCommitted());


    // Create the default text Sofa and return CAS view
    return (CASMgr) aCas.getCAS().getCurrentView();
  }


  // Tokenize text.
  private void tokenize() throws Exception {
    // System.out.println("Tokenizing text.");


    // Create FSs for the token types.
    FeatureStructure wordFS = cas.createFS(wordType);
    FeatureStructure sepFS = cas.createFS(separatorType);
    FeatureStructure eosFS = cas.createFS(eosType);


    String text = cas.getDocumentText();
    TextStringTokenizer tokenizer = new TextStringTokenizer(text);
    tokenizer.setSeparators("/-*&@");
    tokenizer.addWhitespaceChars(",");
    tokenizer.setEndOfSentenceChars(".!?");
    tokenizer.setShowWhitespace(false);
    int tokenTypeCode;
    int wordCounter = 0;
    int sepCounter = 0;
    int endOfSentenceCounter = 0;
    AnnotationFS tokenAnnot;
    while (tokenizer.isValid()) {
      tokenAnnot = cas.createAnnotation(tokenType, tokenizer.getTokenStart(), tokenizer
              .getTokenEnd());
      tokenTypeCode = tokenizer.getTokenType();
      switch (tokenTypeCode) {
        case TextStringTokenizer.EOS: {
          ++endOfSentenceCounter;
          tokenAnnot.setFeatureValue(tokenTypeFeature, eosFS);
          break;
        }
        case TextStringTokenizer.SEP: {
          ++sepCounter;
          tokenAnnot.setFeatureValue(tokenTypeFeature, sepFS);
          break;
        }
        case TextStringTokenizer.WSP: {
          break;
        }
        case TextStringTokenizer.WCH: {
          ++wordCounter;
          tokenAnnot.setFeatureValue(tokenTypeFeature, wordFS);
          // if ((wordCounter % 100000) == 0) {
          // System.out.println("Number of words tokenized: " + wordCounter);
          // }
          break;
        }
        default: {
          throw new Exception("Something went wrong, fire up that debugger!");
        }
      }
      cas.getIndexRepository().addFS(tokenAnnot);
      tokenizer.setToNext();
      // System.out.println("Token: " + tokenizer.nextToken());
    }
    // time = System.currentTimeMillis() - time;
    // System.out.println("Number of words: " + wordCounter);
    // int allTokens = wordCounter + sepCounter + endOfSentenceCounter;
    // System.out.println("Number of tokens: " + allTokens);
    // System.out.println("Time used: " + new TimeSpan(time));


    // FSIterator it = cas.getAnnotationIndex(tokenType).iterator();
    // int count = 0;
    // while (it.isValid()) {
    // ++count;
    // it.moveToNext();
    // }
    // System.out.println("Number of tokens in index: " + count);
  }


  // Very (!) primitive EOS detection.
  private void createSentences() throws CASException {
    // TypeSystem ts = cas.getTypeSystem();
    // Type eosType = ts.getType(EOS_TYPE);
    // Type tokenType = ts.getType(TOKEN_TYPE);
    // //assert(tokenType != null);
    // Type sentenceType = ts.getType(SENT_TYPE);
    // Feature tokenTypeFeature = ts.getFeature(TOKEN_TYPE_FEAT);
    // Feature startFeature = ts.getFeature(CAS.START_FEAT);
    // Feature endFeature = ts.getFeature(CAS.END_FEAT);


    // System.out.println("\nCreating sentence annotations.");


    // Get a handle to the index repository.
    FSIndexRepository indexRepository = cas.getIndexRepository();
    // assert(indexRepository != null);
    Iterator labelIt = indexRepository.getLabels();
    assertTrue(labelIt != null);
    // Get the standard index for tokens.
    FSIndex tokenIndex = cas.getAnnotationIndex(tokenType);
    // assert(tokenIndex != null);
    // Get an iterator over tokens.
    FSIterator it = tokenIndex.iterator();
    // assert(it != null);
    // Now create sentences. We do this as follows: a sentence starts where
    // the first token after an EOS starts, and ends with an EOS.
    long time = System.currentTimeMillis();
    int endOfSentenceCounter = 0;
    it.moveToFirst();
    boolean lookForStart = true;
    int start = 0, end; // Initialize start to pacify compiler.
    FeatureStructure tokenFS, sentFS;
    while (it.isValid()) {
      if (lookForStart) {
        // If we're looking for the start of a sentence, just grab the start
        // of the current FS.
        start = it.get().getIntValue(startFeature);
        lookForStart = false;
      } else {
        // Check if we've reached the end of a sentence.
        tokenFS = it.get();
        if (tokenFS.getFeatureValue(tokenTypeFeature).getType() == eosType) {
          end = tokenFS.getIntValue(endFeature);
          sentFS = cas.createFS(sentenceType);
          sentFS.setIntValue(startFeature, start);
          sentFS.setIntValue(endFeature, end);
          cas.getIndexRepository().addFS(sentFS);
          ++endOfSentenceCounter;
          lookForStart = true;
        }
      }
      it.moveToNext();
    }
    time = System.currentTimeMillis() - time;
    // System.out.println("Created " + endOfSentenceCounter + " sentences: " + new TimeSpan(time));
  }


  // Check results.
  private void checkSentences() throws CASException, CASException {
    TypeSystem ts = cas.getTypeSystem();
    Type localSentenceType = ts.getType(SENT_TYPE);
    // Feature tokenTypeFeature = ts.getFeatureByFullName(TOKEN_TYPE_FEAT);
    // Feature startFeature = ts.getFeatureByFullName(CAS.FEATURE_BASE_NAME_BEGIN);
    // Feature endFeature = ts.getFeatureByFullName(CAS.FEATURE_BASE_NAME_END);


    // Print the first few sentences.
    // System.out.println("\nThe first 10 sentences:\n");
    FSIndex sentenceIndex = cas.getAnnotationIndex(localSentenceType);
    FSIterator it = sentenceIndex.iterator();
    AnnotationFS sentFS;
    if (it.isValid()) {
      sentFS = (AnnotationFS) it.get();
      assertTrue(sentFS.getCoveredText() != null);
    }
    // int counter = 0;
    String text = cas.getDocumentText();
    assertTrue(text != null);
    // while (it.isValid() && counter < 10) {
    // sentFS = (AnnotationFS)it.get();
    // System.out.println(
    // "Sentence: "
    // + sentFS.getCoveredText());
    // it.moveToNext();
    // ++counter;
    // }


    // Now get an iterator over all annotations.
    FSIndex annotIndex = cas.getAnnotationIndex();
    // System.out.println("\nNumber of annotations in index: " + annotIndex.size());


    // Print the first few sentences.
    // System.out.println("The first 50 annotations:\n");


    it = annotIndex.iterator();
    // assert(it.isValid());
    // counter = 0;
    // AnnotationFS fs;
    // while (it.isValid() && counter < 50) {
    // fs = (AnnotationFS)it.get();
    // System.out.print(fs.getType().getName() + ": ");
    // if (fs.getType().getName().equals(CASMgr.DOCUMENT_TYPE)) {
    // // When we see the document, we don't print the whole text ;-)
    // System.out.println("...");
    // } else {
    // System.out.println(
    // fs.getCoveredText());
    // }
    // it.moveToNext();
    // ++counter;
    // }
  }


  // private static String file2String(String file) throws IOException {
  // return file2String(new File(file));
  // }


  /**
   * Read the contents of a file into a string, using the default platform encoding.
   * 
   * @param file
   *          The file to be read in.
   * @return String The contents of the file.
   * @throws IOException
   *           Various I/O errors.
   */
  public static String file2String(File file) throws IOException {
    // Read the file into a string using a char buffer.
    FileReader reader = null;
    int bufSize = (int) file.length(); // length in bytes >= length in chars due to encoding
    char[] buf = new char[bufSize];
    int read_so_far = 0;
    try {
      reader = new FileReader(file);  
      while (read_so_far < bufSize) {
        int count = reader.read(buf, read_so_far, bufSize - read_so_far);
        if (count < 0) {
          break;
        }
        read_so_far += count;
      }


    } finally {
      if (null != reader)
        reader.close();
    }
    return new String(buf, 0, read_so_far);    
  }


  /**
   * Test driver.
   */
  public void testMain() throws Exception {


    // Read the document into a String. I'm sure there are better ways to
    // do this.
    File textFile = JUnitExtension.getFile("data/moby.txt");
    String moby = file2String(textFile);
    // String moby = file2String(System.getProperty("cas.data.test") + "moby.txt");
    String line;
    BufferedReader br = new BufferedReader(new StringReader(moby));
    StringBuffer buf = new StringBuffer();
    ArrayList docs = new ArrayList();
    while ((line = br.readLine()) != null) {
      if (line.startsWith(".. <p")) {
        docs.add(buf.toString());
        buf = new StringBuffer();
      } else {
        buf.append(line + "\n");
      }
    }
    docs.add(buf.toString());
    buf = null;


    final int numDocs = docs.size();
    final int max = 30;
    int docCount = 0;
    long overallTime = System.currentTimeMillis();
    int numTok, numSent;
    CASSerializer cs;
    while (docCount < max) {
      for (int i = 0; i < numDocs && docCount < max; i++) {
        // System.out.println("Processing document: " + i);
        // Set document text in first CAS.
        cas.setDocumentText((String) docs.get(i));


        tokenize();
        numTok = cas.getAnnotationIndex(tokenType).size();
        assertTrue(numTok > 0);
        // System.out.println(" Number of tokens: " + numTok);


        // System.out.println("Serializing...");
        cs = Serialization.serializeNoMetaData(cas);
        cas = Serialization.createCAS(casMgr, cs);


        assertTrue(numTok == cas.getAnnotationIndex(tokenType).size());


        createSentences();
        numSent = cas.getAnnotationIndex(sentenceType).size();
        assertTrue(numSent > 0);
        // System.out.println(" Number of sentences: " + numSent);


        // System.out.println("Serializing...");
        cs = Serialization.serializeNoMetaData(cas);
        cas = Serialization.createCAS(casMgr, cs);


        assertTrue(numTok == cas.getAnnotationIndex(tokenType).size());
        assertTrue(numSent == cas.getAnnotationIndex(sentenceType).size());
        // System.out.println(" Number of tokens: " + numTok);
        checkSentences();


        // System.out.println("Serializing...");
        cs = Serialization.serializeNoMetaData(cas);
        cas = Serialization.createCAS(casMgr, cs);


        assertTrue(numTok == cas.getAnnotationIndex(tokenType).size());
        assertTrue(numSent == cas.getAnnotationIndex(sentenceType).size());
        // System.out.println(" Verify: " + numTok + " tokens, " + numSent + " sentences.");


        casMgr.reset();


        ++docCount;
      }
      // System.out.println("Number of documents processed: " + docCount);
    }
    overallTime = System.currentTimeMillis() - overallTime;
    // System.out.println("Time taken over all: " + new TimeSpan(overallTime));


  }


  public static void main(String[] args) {
    junit.textui.TestRunner.run(SerializationNoMDTest.class);
  }


}
// Source Code of org.apache.uima.cas.test.SerializationNoMDTest

// Related Classes of org.apache.uima.cas.test.SerializationNoMDTest