/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.CharArrayUtils;
import org.carrot2.util.attribute.Bindable;
/**
 * Flags common (stop) words using the lexical data of the language attached to the
 * processing context.
 * <p>
 * The following {@link PreprocessingContext} data is updated by this class:
 * <ul>
 * <li>{@link AllWords#type}</li>
 * </ul>
 * <p>
 * {@link Tokenizer} and {@link CaseNormalizer} must have been run on the context
 * before this class is used.
 */
@Bindable(prefix = "StopListMarker")
public final class StopListMarker
{
    /**
     * Checks every word image in <code>context.allWords</code> against the language's
     * lexical data and sets the {@link ITokenizer#TF_COMMON_WORD} flag in the
     * corresponding {@link AllWords#type} entry for each word reported as common.
     * Entries of words that are not reported as common are left untouched.
     *
     * @param context the preprocessing context whose word types are updated in place
     */
    public void mark(PreprocessingContext context)
    {
        final char [][] images = context.allWords.image;
        final short [] wordTypes = context.allWords.type;

        // A single lookup key and a single scratch buffer are reused for all words;
        // the buffer is replaced only when a longer word is encountered.
        final MutableCharArray lookupKey = new MutableCharArray("");
        char [] scratch = new char [128];

        final ILexicalData lexicalData = context.language.getLexicalData();

        int index = 0;
        while (index < images.length)
        {
            final char [] image = images[index];
            final int length = image.length;

            if (length > scratch.length)
            {
                scratch = new char [length];
            }

            // The lexical data is queried with the lower-cased form of the word,
            // restricted to the word's own length within the scratch buffer.
            CharArrayUtils.toLowerCase(image, scratch);
            lookupKey.reset(scratch, 0, length);

            if (lexicalData.isCommonWord(lookupKey))
            {
                wordTypes[index] |= ITokenizer.TF_COMMON_WORD;
            }

            index++;
        }
    }
}