Package org.carrot2.text.preprocessing.pipeline

Source Code of org.carrot2.text.preprocessing.pipeline.CompletePreprocessingPipeline

/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/

package org.carrot2.text.preprocessing.pipeline;

import java.util.List;

import org.carrot2.core.Document;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.LanguageModel;
import org.carrot2.text.preprocessing.CaseNormalizer;
import org.carrot2.text.preprocessing.DocumentAssigner;
import org.carrot2.text.preprocessing.LabelFilterProcessor;
import org.carrot2.text.preprocessing.LanguageModelStemmer;
import org.carrot2.text.preprocessing.PhraseExtractor;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.StopListMarker;
import org.carrot2.text.preprocessing.Tokenizer;
import org.carrot2.util.attribute.Bindable;

/**
* Performs a complete preprocessing on the provided documents. The preprocessing consists
* of the following steps:
* <ol>
* <li>{@link Tokenizer#tokenize(PreprocessingContext)}</li>
* <li>{@link CaseNormalizer#normalize(PreprocessingContext)}</li>
* <li>{@link LanguageModelStemmer#stem(PreprocessingContext)}</li>
* <li>{@link StopListMarker#mark(PreprocessingContext)}</li>
* <li>{@link PhraseExtractor#extractPhrases(PreprocessingContext)}</li>
* <li>{@link LabelFilterProcessor#process(PreprocessingContext)}</li>
* <li>{@link DocumentAssigner#assign(PreprocessingContext)}</li>
* </ol>
*/
@Bindable(prefix = "PreprocessingPipeline")
public class CompletePreprocessingPipeline extends BasicPreprocessingPipeline
{
    /**
     * Phrase extractor used by the algorithm, contains bindable attributes.
     */
    public final PhraseExtractor phraseExtractor = new PhraseExtractor();

    /**
     * Label filter processor used by the algorithm, contains bindable attributes.
     */
    public final LabelFilterProcessor labelFilterProcessor = new LabelFilterProcessor();

    /**
     * Document assigner used by the algorithm, contains bindable attributes.
     */
    public final DocumentAssigner documentAssigner = new DocumentAssigner();

    @Override
    public PreprocessingContext preprocess(List<Document> documents, String query,
        LanguageCode language)
    {
        final PreprocessingContext context = new PreprocessingContext(
            LanguageModel.create(language, stemmerFactory, tokenizerFactory,
                lexicalDataFactory), documents, query);

        tokenizer.tokenize(context);
        caseNormalizer.normalize(context);
        languageModelStemmer.stem(context);
        stopListMarker.mark(context);
        phraseExtractor.extractPhrases(context);
        labelFilterProcessor.process(context);
        documentAssigner.assign(context);

        context.preprocessingFinished();
        return context;

    }
}
TOP

Related Classes of org.carrot2.text.preprocessing.pipeline.CompletePreprocessingPipeline

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.