Package org.codemap.tasks

Source Code of org.codemap.tasks.ComputeIndexTask

package org.codemap.tasks;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collection;

import org.codemap.util.Log;

import ch.akuhn.hapax.CorpusBuilder;
import ch.akuhn.hapax.Hapax;
import ch.akuhn.hapax.index.LatentSemanticIndex;
import ch.akuhn.util.ProgressMonitor;
import ch.akuhn.values.Arguments;
import ch.akuhn.values.TaskValue;
import ch.akuhn.values.Value;

public class ComputeIndexTask extends TaskValue<LatentSemanticIndex> {

    public ComputeIndexTask(Value<Collection<String>> elements) {
        super("Creating latent semantic index", elements);
    }

    @Override
    protected LatentSemanticIndex computeValue(ProgressMonitor monitor, Arguments args) {
        Collection<String> elements = args.nextOrFail();
        monitor.begin(elements.size());
        CorpusBuilder builder = Hapax.newCorpus()
            .ignoreCase()
            .useCamelCaseScanner()
            .rejectRareTerms()
            .rejectStopwords()
            .latentDimensions(25)
            .useTFIDF();
        for (String path: elements) {
            parseElement(builder, path);
            monitor.worked(1);
        }
        return builder.makeTDM().createIndex();
    }

    protected void parseElement(CorpusBuilder builder, String path) {
        try {
            builder.addDocument(path, new FileInputStream(path));
        } catch (FileNotFoundException e) {
            Log.error(e);
        }
    }
}
TOP

Related Classes of org.codemap.tasks.ComputeIndexTask

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.