Package org.carrot2.text.linguistic

Source Code of org.carrot2.text.linguistic.TokenizerTestBase

/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/

package org.carrot2.text.linguistic;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.tests.CarrotTestCase;
import org.fest.assertions.Assertions;

/**
* A base class for testing Carrot2 tokenizers.
*/
abstract class TokenizerTestBase extends CarrotTestCase
{
    /**
     * Creates the Analyzer under tests.
     */
    protected abstract ITokenizer createTokenStream() throws IOException;

    /**
     * Internal class for comparing sequences of tokens.
     */
    protected static class TokenImage
    {
        final int type;
        final String image;

        public TokenImage(String image, int type)
        {
            this.type = type;
            this.image = image;
        }

        @Override
        public boolean equals(Object o)
        {
            if (o instanceof TokenImage)
            {
                return (((TokenImage) o).image.equals(this.image) && (((TokenImage) o).type == this.type));
            }
            else
            {
                return false;
            }
        }

        @Override
        public int hashCode()
        {
            return image != null ? image.hashCode() ^ type : type;
        }

        public String toString()
        {
            final String rawType = "0x" + Integer.toHexString(type);
            return "[" + rawType + "] " + this.image;
        }
    }

    /**
     * Compare expected and produced token sequences.
     */
    protected void assertEqualTokens(String testString, TokenImage [] expectedTokens)
    {
        try
        {
            final ITokenizer tokenStream = createTokenStream();
            tokenStream.reset(new StringReader(testString));

            final ArrayList<TokenImage> tokens = new ArrayList<TokenImage>();
            short token;
            MutableCharArray buffer = new MutableCharArray();
            while ((token = tokenStream.nextToken()) >= 0)
            {
                tokenStream.setTermBuffer(buffer);
                tokens.add(new TokenImage(buffer.toString(), token));
            }

            for (int i = 0; i < tokens.size(); i++) {
            }       

            Assertions
                .assertThat(tokens)
                .containsExactly((Object[]) expectedTokens);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }

    protected TokenImage term(String image)
    {
        return new TokenImage(image, ITokenizer.TT_TERM);
    }

    protected TokenImage punctuation(String image)
    {
        return new TokenImage(image, ITokenizer.TT_PUNCTUATION);
    }

    protected TokenImage sentenceDelimiter(String image)
    {
        return new TokenImage(image, ITokenizer.TT_PUNCTUATION | ITokenizer.TF_SEPARATOR_SENTENCE);
    }

    protected TokenImage numeric(String image)
    {
        return new TokenImage(image, ITokenizer.TT_NUMERIC);
    }

    protected TokenImage [] tokens(int type, String... images)
    {
        final TokenImage [] result = new TokenImage [images.length];

        for (int i = 0; i < images.length; i++)
        {
            result[i] = new TokenImage(images[i], type);
        }

        return result;
    }
}
TOP

Related Classes of org.carrot2.text.linguistic.TokenizerTestBase

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.