Package org.apache.mahout.math.hadoop

Examples of org.apache.mahout.math.hadoop.DistributedRowMatrix


                 boolean isSymmetric,
                 int desiredRank) throws Exception {
    Matrix eigenVectors = new DenseMatrix(desiredRank, numCols);
    List<Double> eigenValues = new ArrayList<Double>();

    DistributedRowMatrix matrix = new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
    matrix.configure(new JobConf(getConf() != null ? getConf() : new Configuration()));
    solve(matrix, desiredRank, eigenVectors, eigenValues, isSymmetric);

    Path outputEigenVectorPath = new Path(outputPath, RAW_EIGENVECTORS);
    serializeOutput(eigenVectors, eigenValues, outputEigenVectorPath);
    return 0;
View Full Code Here


public final class TestDistributedLanczosSolverCLI extends MahoutTestCase {

  @Test
  public void testDistributedLanczosSolverCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus =
        new TestDistributedRowMatrix().randomDistributedMatrix(500, 450, 500, 10, 10.0, true, testData.toString());
    corpus.configure(new JobConf());
    Path output = getTestTempDirPath("output");
    Path tmp = getTestTempDirPath("tmp");
    String[] args = {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(), "--numRows", "500",
        "--numCols", "500",
        "--rank", "10",
        "--symmetric", "true"
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);

    Path rawEigenvectors = new Path(output, DistributedLanczosSolver.RAW_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(10, corpus.numCols());
    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(rawEigenvectors.toUri(), conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, rawEigenvectors, conf);
    try {
View Full Code Here

  }

  @Test
  public void testDistributedLanczosSolverEVJCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus =
        new TestDistributedRowMatrix().randomDistributedMatrix(500, 450, 500, 10, 10.0, true, testData.toString());
    corpus.configure(new JobConf());
    Path output = getTestTempDirPath("output");
    Path tmp = getTestTempDirPath("tmp");
    String[] args = {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "500",
        "--numCols", "500",
        "--rank", "10",
        "--symmetric", "true",
        "--cleansvd", "true"
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);
 
    Path cleanEigenvectors = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(10, corpus.numCols());
    Configuration conf = new Configuration();
 
    FileSystem fs = FileSystem.get(cleanEigenvectors.toUri(), conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, cleanEigenvectors, conf);
    try {
View Full Code Here

public final class TestDistributedLanczosSolver extends SolverTest {

  private void doTestDistributedLanczosSolver(boolean symmetric) throws IOException {
    File testData = getTestTempDir("testdata");
    DistributedRowMatrix corpus = new TestDistributedRowMatrix().randomDistributedMatrix(500,
        450, 400, 10, 10.0, symmetric, testData.getAbsolutePath());
    corpus.configure(new JobConf());
    DistributedLanczosSolver solver = new DistributedLanczosSolver();
    int desiredRank = 30;
    Matrix eigenVectors = new DenseMatrix(desiredRank, corpus.numCols());
    List<Double> eigenValues = new ArrayList<Double>();
    solver.solve(corpus, desiredRank, eigenVectors, eigenValues, symmetric);
    assertOrthonormal(eigenVectors);
    assertEigen(eigenVectors, corpus, eigenVectors.numRows() / 2, 0.01, symmetric);
  }
View Full Code Here

    // set the instance variables
    // create a few new Paths for temp files and transformations
    Path outputCalc = new Path(output, "calculations");
    Path outputTmp = new Path(output, "temporary");

    DistributedRowMatrix A = AffinityMatrixInputJob.runJob(input, outputCalc, dimensions);
    Vector D = MatrixDiagonalizeJob.runJob(A.getRowPath(), dimensions);

    long numCuts;
    do {
      // first three steps are the same as spectral k-means:
      // 1) calculate D from A
      // 2) calculate L = D^-0.5 * A * D^-0.5
      // 3) calculate eigenvectors of L

      DistributedRowMatrix L =
          VectorMatrixMultiplicationJob.runJob(A.getRowPath(), D,
              new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)));
      L.setConf(new Configuration(conf));

      // eigendecomposition (step 3)
      int overshoot = (int) ((double) eigenrank * OVERSHOOT_MULTIPLIER);
      LanczosState state = new LanczosState(L, eigenrank,
          new DistributedLanczosSolver().getInitialVector(L));

      DistributedRowMatrix U = performEigenDecomposition(conf, L, state, eigenrank, overshoot, outputCalc);
      U.setConf(new Configuration(conf));
      List<Double> eigenValues = Lists.newArrayList();
      for(int i=0; i<eigenrank; i++) {
        eigenValues.set(i, state.getSingularValue(i));
      }

      // here's where things get interesting: steps 4, 5, and 6 are unique
      // to this algorithm, and depending on the final output, steps 1-3
      // may be repeated as well

      // helper method, since apparently List and Vector objects don't play nicely
      Vector evs = listToVector(eigenValues);

      // calculate sensitivities (step 4 and step 5)
      Path sensitivities = new Path(outputCalc, "sensitivities-" + (System.nanoTime() & 0xFF));
      EigencutsSensitivityJob.runJob(evs, D, U.getRowPath(), halflife, tau, median(D), epsilon, sensitivities);

      // perform the cuts (step 6)
      input = new Path(outputTmp, "nextAff-" + (System.nanoTime() & 0xFF));
      numCuts = EigencutsAffinityCutsJob.runjob(A.getRowPath(), sensitivities, input, conf);

      // how many cuts were made?
      if (numCuts > 0) {
        // recalculate A
        A = new DistributedRowMatrix(input,
                                     new Path(outputTmp, Long.toString(System.nanoTime())), dimensions, dimensions);
        A.setConf(new Configuration());
      }
    } while (numCuts > 0);

View Full Code Here

    // now run the verifier to trim down the number of eigenvectors
    EigenVerificationJob verifier = new EigenVerificationJob();
    Path verifiedEigens = new Path(tmp, "verifiedeigens");
    verifier.runJob(conf, seqFiles, input.getRowPath(), verifiedEigens, false, 1.0, numEigenVectors);
    Path cleanedEigens = verifier.getCleanedEigensPath();
    return new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), numEigenVectors, input.numRows());
  }
View Full Code Here

    Path affSeqFiles = new Path(outputCalc, "seqfile-" + (System.nanoTime() & 0xFF));
    AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims);

    // Next step: construct the affinity matrix using the newly-created
    // sequence files
    DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles,
                                                      new Path(outputTmp, "afftmp-" + (System.nanoTime() & 0xFF)),
                                                      numDims,
                                                      numDims);
    Configuration depConf = new Configuration(conf);
    A.setConf(depConf);

    // Next step: construct the diagonal matrix D (represented as a vector)
    // and calculate the normalized Laplacian of the form:
    // L = D^(-0.5)AD^(-0.5)
    Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims);
    DistributedRowMatrix L =
        VectorMatrixMultiplicationJob.runJob(affSeqFiles, D,
            new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)), new Path(outputCalc, "laplacian-tmp-" + (System.nanoTime() & 0xFF)));
    L.setConf(depConf);

    // Next step: perform eigen-decomposition using LanczosSolver
    // since some of the eigen-output is spurious and will be eliminated
    // upon verification, we have to aim to overshoot and then discard
    // unnecessary vectors later
    int overshoot = (int) ((double) clusters * OVERSHOOT_MULTIPLIER);
    DistributedLanczosSolver solver = new DistributedLanczosSolver();
    LanczosState state = new LanczosState(L, numDims, solver.getInitialVector(L));
    Path lanczosSeqFiles = new Path(outputCalc, "eigenvectors-" + (System.nanoTime() & 0xFF));
    solver.runJob(conf,
                  state,
                  overshoot,
                  true,
                  lanczosSeqFiles.toString());

    // perform a verification
    EigenVerificationJob verifier = new EigenVerificationJob();
    Path verifiedEigensPath = new Path(outputCalc, "eigenverifier");
    verifier.runJob(conf, lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, clusters);
    Path cleanedEigens = verifier.getCleanedEigensPath();
    DistributedRowMatrix W = new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), clusters, numDims);
    W.setConf(depConf);
    DistributedRowMatrix Wtrans = W.transpose();
    //    DistributedRowMatrix Wt = W.transpose();

    // next step: normalize the rows of Wt to unit length
    Path unitVectors = new Path(outputCalc, "unitvectors-" + (System.nanoTime() & 0xFF));
    UnitVectorizerJob.runJob(Wtrans.getRowPath(), unitVectors);
    DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
    Wt.setConf(depConf);

    // Finally, perform k-means clustering on the rows of L (or W)
    // generate random initial clusters
    Path initialclusters = RandomSeedGenerator.buildRandom(conf,
                                                           Wt.getRowPath(),
                                                           new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
                                                           clusters,
                                                           measure);
   
    // The output format is the same as the K-means output format.
    // TODO: Perhaps a conversion of the output format from points and clusters
    // in eigenspace to the original dataset. Currently, the user has to perform
    // the association step after this job finishes on their own.
    KMeansDriver.run(conf,
                     Wt.getRowPath(),
                     initialclusters,
                     output,
                     measure,
                     convergenceDelta,
                     maxIterations,
View Full Code Here

    // set the instance variables
    // create a few new Paths for temp files and transformations
    Path outputCalc = new Path(output, "calculations");
    Path outputTmp = new Path(output, "temporary");

    DistributedRowMatrix A = AffinityMatrixInputJob.runJob(input, outputCalc, dimensions);
    Vector D = MatrixDiagonalizeJob.runJob(A.getRowPath(), dimensions);

    long numCuts;
    do {
      // first three steps are the same as spectral k-means:
      // 1) calculate D from A
      // 2) calculate L = D^-0.5 * A * D^-0.5
      // 3) calculate eigenvectors of L

      DistributedRowMatrix L =
          VectorMatrixMultiplicationJob.runJob(A.getRowPath(), D,
              new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)));
      L.setConf(new Configuration(conf));

      // eigendecomposition (step 3)
      int overshoot = (int) ((double) eigenrank * OVERSHOOT_MULTIPLIER);
      LanczosState state = new LanczosState(L, eigenrank,
          DistributedLanczosSolver.getInitialVector(L));

      DistributedRowMatrix U = performEigenDecomposition(conf, L, state, eigenrank, overshoot, outputCalc);
      U.setConf(new Configuration(conf));
      List<Double> eigenValues = Lists.newArrayList();
      for (int i = 0; i < eigenrank; i++) {
        eigenValues.set(i, state.getSingularValue(i));
      }

      // here's where things get interesting: steps 4, 5, and 6 are unique
      // to this algorithm, and depending on the final output, steps 1-3
      // may be repeated as well

      // helper method, since apparently List and Vector objects don't play nicely
      Vector evs = listToVector(eigenValues);

      // calculate sensitivities (step 4 and step 5)
      Path sensitivities = new Path(outputCalc, "sensitivities-" + (System.nanoTime() & 0xFF));
      EigencutsSensitivityJob.runJob(evs, D, U.getRowPath(), halflife, tau, median(D), epsilon, sensitivities);

      // perform the cuts (step 6)
      input = new Path(outputTmp, "nextAff-" + (System.nanoTime() & 0xFF));
      numCuts = EigencutsAffinityCutsJob.runjob(A.getRowPath(), sensitivities, input, conf);

      // how many cuts were made?
      if (numCuts > 0) {
        // recalculate A
        A = new DistributedRowMatrix(input,
                                     new Path(outputTmp, Long.toString(System.nanoTime())), dimensions, dimensions);
        A.setConf(new Configuration());
      }
    } while (numCuts > 0);

View Full Code Here

    // now run the verifier to trim down the number of eigenvectors
    EigenVerificationJob verifier = new EigenVerificationJob();
    Path verifiedEigens = new Path(tmp, "verifiedeigens");
    verifier.runJob(conf, seqFiles, input.getRowPath(), verifiedEigens, false, 1.0, numEigenVectors);
    Path cleanedEigens = verifier.getCleanedEigensPath();
    return new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), numEigenVectors, input.numRows());
  }
View Full Code Here

                       int numCols,
                       Vector b,
                       Preconditioner preconditioner,
                       int maxIterations,
                       double maxError) {
    DistributedRowMatrix matrix = new DistributedRowMatrix(inputPath, tempPath, numRows, numCols);
    matrix.setConf(conf);
       
    return solve(matrix, b, preconditioner, maxIterations, maxError);
  }
View Full Code Here

TOP

Related Classes of org.apache.mahout.math.hadoop.DistributedRowMatrix

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.