Source Code of org.apache.stanbol.enhancer.nlp.json.AnalyzedTextSerializerAndParserTest

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.nlp.json;


import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;


import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;
import java.util.Map.Entry;


import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.morpho.Case;
import org.apache.stanbol.enhancer.nlp.morpho.CaseTag;
import org.apache.stanbol.enhancer.nlp.morpho.Definitness;
import org.apache.stanbol.enhancer.nlp.morpho.Gender;
import org.apache.stanbol.enhancer.nlp.morpho.GenderTag;
import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
import org.apache.stanbol.enhancer.nlp.morpho.NumberFeature;
import org.apache.stanbol.enhancer.nlp.morpho.NumberTag;
import org.apache.stanbol.enhancer.nlp.morpho.Person;
import org.apache.stanbol.enhancer.nlp.morpho.Tense;
import org.apache.stanbol.enhancer.nlp.morpho.TenseTag;
import org.apache.stanbol.enhancer.nlp.morpho.VerbMood;
import org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public class AnalyzedTextSerializerAndParserTest {


    private static Logger log = LoggerFactory.getLogger(AnalyzedTextSerializerAndParserTest.class);


    public static final String text = "The Stanbol enhancer can detect famous " +
            "cities such as Paris and people such as Bob Marley.";
    
    public static final Annotation<Number> testAnnotation = 
            new Annotation<Number>("test", Number.class);
    
    /* -----
     * Test data creates within the BeforeClass
     * -----
     */
    /**
     * AnalysedText instance filled in {@link #setup()} with test dats
     */
    private static AnalysedText analysedTextWithData;
    private static LinkedHashMap<Sentence,String> expectedSentences = new LinkedHashMap<Sentence,String>();
    private static LinkedHashMap<Chunk,String> expectedChunks = new LinkedHashMap<Chunk,String>();
    private static LinkedHashMap<Token,String> expectedTokens = new LinkedHashMap<Token,String>();
    
    /* -----
     * Test data creates before every single test
     * -----
     */
    /**
     * Empty AnalysedText instance created before each test
     */
    private static AnalysedText at;


    private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
    private static final AnalysedTextFactory atFactory = AnalysedTextFactory.getDefaultInstance();
    
    private static ContentItem ci;


    private static Entry<UriRef,Blob> textBlob;
    
    @BeforeClass
    public static final void setup() throws IOException {
        ci = ciFactory.createContentItem(new StringSource(text));
        textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
        analysedTextWithData = createAnalysedText();
        int sentence = text.indexOf('.')+1;
        Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
        expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " +
            "cities such as Paris and people such as Bob Marley.");
        
        Token the = sent1.addToken(0, 3);
        expectedTokens.put(the, "The");
        the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("PREP",Pos.Preposition), 0.85));
        
        Token stanbol = sent1.addToken(4,11);
        expectedTokens.put(stanbol, "Stanbol");
        stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("PN", Pos.ProperNoun),0.95));
        stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(
            0.5));
        
        //use index to create Tokens
        int enhancerStart = sent1.getSpan().toString().indexOf("enhancer");
        Token enhancer = sent1.addToken(enhancerStart,enhancerStart+"enhancer".length());
        expectedTokens.put(enhancer, "enhancer");
        enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("PN", Pos.ProperNoun),0.95));
        enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("N", LexicalCategory.Noun),0.87));
        MorphoFeatures morpho = new MorphoFeatures("enhance");
        morpho.addCase(new CaseTag("test-case-1",Case.Comitative));
        morpho.addCase(new CaseTag("test-case-2",Case.Abessive));
        morpho.addDefinitness(Definitness.Definite);
        morpho.addPerson(Person.First);
        morpho.addPos(new PosTag("PN", Pos.ProperNoun));
        morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
        morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
        morpho.addTense(new TenseTag("test-tense", Tense.Present));
        morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
        enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));


        //create a chunk
        Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
        expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
        stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(
            new NerTag("organization", DBPEDIA_ORGANISATION)));
        stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(
            new PhraseTag("NP", LexicalCategory.Noun),0.98));


    }
    @Before
    public void initAnalysedText() throws Exception {
        at = createAnalysedText();
    }
    /**
     * @throws IOException
     */
    private static AnalysedText createAnalysedText() throws IOException {
        return  atFactory.createAnalysedText(textBlob.getValue());
    }
    
    @Test
    public void testSerialization() throws IOException {
        ByteArrayOutputStream bout = new ByteArrayOutputStream();
        AnalyzedTextSerializer serializer = AnalyzedTextSerializer.getDefaultInstance();
        serializer.serialize(analysedTextWithData, bout, null);
        //get the serialized String and check for some expected elements
        byte[] data = bout.toByteArray();
        String serialized = new String(data,Charset.forName("UTF-8"));
        Assert.assertTrue(serialized.contains("\"spans\" : [ {"));
        Assert.assertTrue(serialized.contains("\"type\" : \"Text\""));
        Assert.assertTrue(serialized.contains("\"type\" : \"Sentence\""));
        Assert.assertTrue(serialized.contains("\"type\" : \"Token\""));
        Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.pos\" : {"));
        Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.pos.PosTag\""));
        Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.ner\" : {"));
        Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.ner.NerTag\""));
        Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.morpho\" : {"));
        Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures\""));
        //deserialize
        AnalyzedTextParser parser = AnalyzedTextParser.getDefaultInstance();
        AnalysedText parsedAt = parser.parse(new ByteArrayInputStream(data), null, 
            atFactory.createAnalysedText(textBlob.getValue()));
        Assert.assertEquals(analysedTextWithData, parsedAt);
        Iterator<Span> origSpanIt = analysedTextWithData.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
        Iterator<Span> parsedSpanIt = parsedAt.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
        while(origSpanIt.hasNext() && parsedSpanIt.hasNext()){
            Span orig = origSpanIt.next();
            Span parsed = parsedSpanIt.next();
            Assert.assertEquals(orig, parsed);
            Set<String> origKeys = orig.getKeys();
            Set<String> parsedKeys = parsed.getKeys();
            Assert.assertEquals(origKeys, parsedKeys);
            for(String key : origKeys){
                List<Value<?>> origValues = orig.getValues(key);
                List<Value<?>> parsedValues = parsed.getValues(key);
                Assert.assertEquals(origValues, parsedValues);
            }
        }
        Assert.assertFalse("Original AnalyzedText MUST NOT have additional Spans",origSpanIt.hasNext());
        Assert.assertFalse("Parsed AnalyzedText MUST NOT have additional Spans",parsedSpanIt.hasNext());
    }
    
}
Source Code of org.apache.stanbol.enhancer.nlp.json.AnalyzedTextSerializerAndParserTest

Related Classes of org.apache.stanbol.enhancer.nlp.json.AnalyzedTextSerializerAndParserTest