/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.IStemmerFactory;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;
public final class TestLanguageModelFactory implements IStemmerFactory,
ITokenizerFactory, ILexicalDataFactory
{
@Override
public IStemmer getStemmer(LanguageCode language)
{
return new IStemmer()
{
public CharSequence stem(CharSequence word)
{
if (word.length() > 2)
{
return word.subSequence(0, word.length() - 2);
}
else
{
return null;
}
}
};
}
@Override
public ITokenizer getTokenizer(LanguageCode language)
{
return new ExtendedWhitespaceTokenizer();
}
@Override
public ILexicalData getLexicalData(LanguageCode language)
{
return new ILexicalData()
{
public boolean isCommonWord(MutableCharArray word)
{
return word.toString().contains("stop");
}
public boolean isStopLabel(CharSequence formattedLabel)
{
return formattedLabel.toString().startsWith("stoplabel");
}
};
}
}