Package dkpro.similarity.algorithms.sspace

Source Code of dkpro.similarity.algorithms.sspace.SSpaceVectorReader

/*******************************************************************************
* Copyright 2012
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU Public License v3.0
* which accompanies this distribution, and is available at
* http://www.gnu.org/licenses/gpl-3.0.txt
*******************************************************************************/
package dkpro.similarity.algorithms.sspace;

import static java.util.Arrays.asList;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Collection;
import java.util.Properties;

import no.uib.cipr.matrix.Vector;
import no.uib.cipr.matrix.sparse.SparseVector;

import org.apache.commons.io.FileUtils;

import dkpro.similarity.algorithms.api.SimilarityException;
import dkpro.similarity.algorithms.sspace.util.DocumentVectorBuilder;
import dkpro.similarity.algorithms.sspace.util.LatentSemanticAnalysis;
import dkpro.similarity.algorithms.sspace.util.VectorAdapter;
import dkpro.similarity.algorithms.vsm.store.VectorReader;
import edu.ucla.sspace.common.SemanticSpace;

/**
* Vector reader accessing a {@link SemanticSpace}.
*
* @author Richard Eckart de Castilho
*/
public class SSpaceVectorReader
  extends VectorReader
{
  private final SemanticSpace sspace;
  private final DocumentVectorBuilder builder;

  /**
   * Create a vector source for an existing semantic space. To build such a space, you best
   * use the LsiIndexWriter from DKPro IR.
   *
   * @parameter dimensions An existing semantic space (usually an *.sspace file)
   */
  public SSpaceVectorReader(SemanticSpace aSSpace)
  {
    sspace = aSSpace;
    builder = new DocumentVectorBuilder(aSSpace);
  }

  /**
   * Creates a new semantic space from scratch based on all "txt" documents in the specified path.
   * If you want to have full control over the tokenization, normalization, stop word removal
   * and such, you should build the semantic space yourself and then use the other constructor.
   *
   * @parameter directory Path to the document collection
   * @see {@link #createSemanticSpace}
   */
  public SSpaceVectorReader(File aDirectory)
    throws IOException
  {
    this(createSemanticSpace(aDirectory, -1));
  }

  @Override
  public Vector getVector(String aTerm)
    throws SimilarityException
  {
    Vector vec1 = new SparseVector(getConceptCount());
    builder.buildVector(asList(aTerm), VectorAdapter.create(vec1));
    return vec1;
  }

  @Override
  public int getConceptCount()
    throws SimilarityException
  {
    // Nasty stack-overflow bug in getVectorLength()!
    return sspace.getVector(sspace.getWords().iterator().next()).length();
  }

  @Override
  public String getId()
  {
    return sspace.getSpaceName();
  }

  @Override
  public void close()
  {
    // Nothing to do
  }

  /**
   * Create a LSA space from all "txt" files in the specified directory.
   *
   * @param aInputDir directory containing the input files.
   * @param aMaxDimensions maximum number of dimensions. If 0 or negative, 300 is used, which has
   * been determined as a good default for LSA models. If the directory contains less than 300
   * documents, the number of documents is used as the number of dimensions.
   * @return a semantic space.
   * @throws IOException
   */
  public static SemanticSpace createSemanticSpace(File aInputDir, int aMaxDimensions)
    throws IOException
  {
    LatentSemanticAnalysis sspace = new LatentSemanticAnalysis();

    Collection<File> documents = FileUtils.listFiles(aInputDir, new String[] { "txt" }, true);

    for (File document : documents) {
      BufferedReader reader = new BufferedReader(new FileReader(document));
      sspace.processDocument(reader);
    }

    int dimensions = Math.min(documents.size(), aMaxDimensions <= 0 ? 300 : aMaxDimensions);

    Properties props = new Properties();
    props.setProperty(LatentSemanticAnalysis.LSA_DIMENSIONS_PROPERTY, Integer.toString(dimensions));
    sspace.processSpace(props);

    return sspace;
  }
}
TOP

Related Classes of dkpro.similarity.algorithms.sspace.SSpaceVectorReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.