// Source code of org.carrot2.text.preprocessing.StopListMarker


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */


package org.carrot2.text.preprocessing;


import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.CharArrayUtils;
import org.carrot2.util.attribute.Bindable;


/**
 * Marks stop words based on the current language model.
 * <p>
 * This class saves the following results to the {@link PreprocessingContext}:
 * <ul>
 * <li>{@link AllWords#type}</li>
 * </ul>
 * <p>
 * This class requires that {@link Tokenizer} and {@link CaseNormalizer} be invoked first.
 */
@Bindable(prefix = "StopListMarker")
public final class StopListMarker
{
    /**
     * Marks stop words and saves the results to the <code>context</code>.
     */
    public void mark(PreprocessingContext context)
    {
        final char [][] wordImages = context.allWords.image;
        final short [] types = context.allWords.type;


        final MutableCharArray mutableCharArray = new MutableCharArray("");
        char [] buffer = new char [128];
        final ILexicalData lexData = context.language.getLexicalData();


        for (int i = 0; i < wordImages.length; i++)
        {
            final char [] word = wordImages[i];
            if (buffer.length < word.length) buffer = new char [word.length];


            CharArrayUtils.toLowerCase(word, buffer);
            mutableCharArray.reset(buffer, 0, word.length);
            if (lexData.isCommonWord(mutableCharArray))
            {
                types[i] |= ITokenizer.TF_COMMON_WORD;
            }
        }
    }
}
// Source code of org.carrot2.text.preprocessing.StopListMarker
//
// Related classes of org.carrot2.text.preprocessing.StopListMarker