Package dkpro.similarity.algorithms.sspace.util

Source Code of dkpro.similarity.algorithms.sspace.util.LsaIndexer

package dkpro.similarity.algorithms.sspace.util;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Properties;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory;
import dkpro.similarity.algorithms.sspace.util.LatentSemanticAnalysis;
import edu.ucla.sspace.common.SemanticSpaceIO;

/**
* Creates a semantic space for LSA.
*
*/
public class LsaIndexer
    extends JCasAnnotator_ImplBase
{

  /**
   * Path to the directory where the semantic space will be stored.
   */
  public static final String PARAM_INDEX_PATH = "IndexPath";
  @ConfigurationParameter(name = PARAM_INDEX_PATH, mandatory = true)
  private File indexPath;
 
  /**
     * The maximum number of dimensions in the semantic space.
     */
    public static final String PARAM_MAX_DIMENSIONS = "maxDimensions";
    @ConfigurationParameter(name = PARAM_MAX_DIMENSIONS, mandatory = true, defaultValue = "300")
    private int maxDimensions;
   
    /**
     * This annotator is type agnostic, so it is mandatory to specify the type of the working
     * annotation and how to obtain the string representation with the feature path.
     */
    public static final String PARAM_FEATURE_PATH = "featurePath";
    @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true)
    private String featurePath;

    private LatentSemanticAnalysis sspace;
   
    private int nrOfDocuments;

  @Override
  public void initialize(UimaContext context)
          throws ResourceInitializationException
  {
    super.initialize(context);

    nrOfDocuments = 0;
   
        try {
            sspace = new LatentSemanticAnalysis();
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
  }

  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException {
      nrOfDocuments++;
     
    final List<String> terms = new ArrayList<String>();
        try {
            for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(jCas.getCas(),
                    featurePath))
            {
//                System.out.println(entry.getKey());
//                System.out.println(entry.getValue());
                terms.add(entry.getValue());
            }
           
            sspace.processDocument(terms);
        }
        catch (FeaturePathException e) {
            throw new AnalysisEngineProcessException(e);
        }
  }

  @Override
  public void collectionProcessComplete()
          throws AnalysisEngineProcessException
  {
    super.collectionProcessComplete();
   
        int dimensions = Math.min(nrOfDocuments, maxDimensions);

        Properties props = new Properties();
        props.setProperty(LatentSemanticAnalysis.LSA_DIMENSIONS_PROPERTY, Integer.toString(dimensions));
        sspace.processSpace(props);

        // serialize to disk
        try {
            indexPath.mkdirs();
            SemanticSpaceIO.save(sspace, new File(indexPath, "test.sspace"));
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
       
  }
}
TOP

Related Classes of dkpro.similarity.algorithms.sspace.util.LsaIndexer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.