// Source code of org.carrot2.text.linguistic.TokenizerTestBase


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */


package org.carrot2.text.linguistic;


import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;


import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.tests.CarrotTestCase;
import org.fest.assertions.Assertions;


/**
 * A base class for testing Carrot2 tokenizers.
 */
abstract class TokenizerTestBase extends CarrotTestCase
{
    /**
     * Creates the Analyzer under tests.
     */
    protected abstract ITokenizer createTokenStream() throws IOException;


    /**
     * Internal class for comparing sequences of tokens.
     */
    protected static class TokenImage
    {
        final int type;
        final String image;


        public TokenImage(String image, int type)
        {
            this.type = type;
            this.image = image;
        }


        @Override
        public boolean equals(Object o)
        {
            if (o instanceof TokenImage)
            {
                return (((TokenImage) o).image.equals(this.image) && (((TokenImage) o).type == this.type));
            }
            else
            {
                return false;
            }
        }


        @Override
        public int hashCode()
        {
            return image != null ? image.hashCode() ^ type : type;
        }


        public String toString()
        {
            final String rawType = "0x" + Integer.toHexString(type);
            return "[" + rawType + "] " + this.image;
        }
    }


    /**
     * Compare expected and produced token sequences.
     */
    protected void assertEqualTokens(String testString, TokenImage [] expectedTokens)
    {
        try
        {
            final ITokenizer tokenStream = createTokenStream();
            tokenStream.reset(new StringReader(testString));


            final ArrayList<TokenImage> tokens = new ArrayList<TokenImage>();
            short token;
            MutableCharArray buffer = new MutableCharArray();
            while ((token = tokenStream.nextToken()) >= 0)
            {
                tokenStream.setTermBuffer(buffer);
                tokens.add(new TokenImage(buffer.toString(), token));
            }


            for (int i = 0; i < tokens.size(); i++) {
            }        


            Assertions
                .assertThat(tokens)
                .containsExactly((Object[]) expectedTokens);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }


    protected TokenImage term(String image)
    {
        return new TokenImage(image, ITokenizer.TT_TERM);
    }


    protected TokenImage punctuation(String image)
    {
        return new TokenImage(image, ITokenizer.TT_PUNCTUATION);
    }


    protected TokenImage sentenceDelimiter(String image)
    {
        return new TokenImage(image, ITokenizer.TT_PUNCTUATION | ITokenizer.TF_SEPARATOR_SENTENCE);
    }


    protected TokenImage numeric(String image)
    {
        return new TokenImage(image, ITokenizer.TT_NUMERIC);
    }


    protected TokenImage [] tokens(int type, String... images)
    {
        final TokenImage [] result = new TokenImage [images.length];


        for (int i = 0; i < images.length; i++)
        {
            result[i] = new TokenImage(images[i], type);
        }


        return result;
    }
}
// Source code of org.carrot2.text.linguistic.TokenizerTestBase

// Related classes of org.carrot2.text.linguistic.TokenizerTestBase