Package org.carrot2.text.preprocessing

Source Code of org.carrot2.text.preprocessing.StopListMarker

/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/

package org.carrot2.text.preprocessing;

import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.CharArrayUtils;
import org.carrot2.util.attribute.Bindable;

/**
* Marks stop words based on the current language model.
* <p>
* This class saves the following results to the {@link PreprocessingContext}:
* <ul>
* <li>{@link AllWords#type}</li>
* </ul>
* <p>
* This class requires that {@link Tokenizer} and {@link CaseNormalizer} be invoked first.
*/
@Bindable(prefix = "StopListMarker")
public final class StopListMarker
{
    /**
     * Marks stop words and saves the results to the <code>context</code>.
     */
    public void mark(PreprocessingContext context)
    {
        final char [][] wordImages = context.allWords.image;
        final short [] types = context.allWords.type;

        final MutableCharArray mutableCharArray = new MutableCharArray("");
        char [] buffer = new char [128];
        final ILexicalData lexData = context.language.getLexicalData();

        for (int i = 0; i < wordImages.length; i++)
        {
            final char [] word = wordImages[i];
            if (buffer.length < word.length) buffer = new char [word.length];

            CharArrayUtils.toLowerCase(word, buffer);
            mutableCharArray.reset(buffer, 0, word.length);
            if (lexData.isCommonWord(mutableCharArray))
            {
                types[i] |= ITokenizer.TF_COMMON_WORD;
            }
        }
    }
}
TOP

Related Classes of org.carrot2.text.preprocessing.StopListMarker

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.