Examples of org.apache.mahout.math.hadoop.DistributedRowMatrix

org.apache.mahout.math.hadoop.DistributedRowMatrix
path must already contain an already created SequenceFile! DistributedRowMatrix m = new DistributedRowMatrix("path/to/vector/sequenceFile", "tmp/path", 10000000, 250000); m.configure(new JobConf()); // now if we want to multiply a vector by this matrix, its dimension must equal the row dimension of this // matrix. If we want to timesSquared() a vector by this matrix, its dimension must equal the column dimension // of the matrix. Vector v = new DenseVector(250000); // now the following operation will be done via a M/R pass via Hadoop. Vector w = m.timesSquared(v);

                             int numRows,
                             int numCols,
                             boolean isSymmetric,
                             int desiredRank,
                             String outputEigenVectorPathString) throws IOException {
    DistributedRowMatrix matrix = new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
    matrix.setConf(new Configuration(originalConfig));
    LanczosState state = new LanczosState(matrix, desiredRank, getInitialVector(matrix));
    return runJob(originalConfig, state, desiredRank, isSymmetric, outputEigenVectorPathString);
  }

View Full Code Here

                 Path workingDirPath,
                 int numRows,
                 int numCols,
                 boolean isSymmetric,
                 int desiredRank) throws Exception {
    DistributedRowMatrix matrix = new DistributedRowMatrix(inputPath, outputTmpPath, numRows, numCols);
    matrix.setConf(new Configuration(getConf() != null ? getConf() : new Configuration()));


    LanczosState state;
    if (workingDirPath == null) {
      state = new LanczosState(matrix, desiredRank, getInitialVector(matrix));
    } else {
      HdfsBackedLanczosState hState =
          new HdfsBackedLanczosState(matrix, desiredRank, getInitialVector(matrix), workingDirPath);
      hState.setConf(matrix.getConf());
      state = hState;
    }
    solve(state, desiredRank, isSymmetric);


    Path outputEigenVectorPath = new Path(outputPath, RAW_EIGENVECTORS);

View Full Code Here

    this.minEigenValue = minEigenValue;


    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(conf, eigenInput, inMemory);
    }
    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tempOut, 1, 1);
    c.setConf(conf);
    corpus = c;


    // set up eigenverifier and orthoverifier TODO: allow multithreaded execution


    eigenVerifier = new SimpleEigenVerifier();

View Full Code Here

    }
    return eigenMetaData;
  }


  private void prepareEigens(Configuration conf, Path eigenInput, boolean inMemory) {
    DistributedRowMatrix eigens = new DistributedRowMatrix(eigenInput, tmpOut, 1, 1);
    eigens.setConf(conf);
    if (inMemory) {
      List<Vector> eigenVectors = Lists.newArrayList();
      for (MatrixSlice slice : eigens) {
        eigenVectors.add(slice.vector());
      }

View Full Code Here

    this.maxError = maxError;
    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(new Configuration(conf), eigenInput, inMemory);
    }


    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1);
    c.setConf(new Configuration(conf));
    corpus = c;


    eigenVerifier = new SimpleEigenVerifier();


    Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens();

View Full Code Here

    Path affSeqFiles = new Path(outputCalc, "seqfile-" + (System.nanoTime() & 0xFF));
    AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims);


    // Next step: construct the affinity matrix using the newly-created
    // sequence files
    DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles,
                                                      new Path(outputTmp, "afftmp-" + (System.nanoTime() & 0xFF)),
                                                      numDims,
                                                      numDims);
    Configuration depConf = new Configuration(conf);
    A.setConf(depConf);


    // Next step: construct the diagonal matrix D (represented as a vector)
    // and calculate the normalized Laplacian of the form:
    // L = D^(-0.5)AD^(-0.5)
    Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims);
    DistributedRowMatrix L =
        VectorMatrixMultiplicationJob.runJob(affSeqFiles, D,
            new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)), new Path(outputCalc, "laplacian-tmp-" + (System.nanoTime() & 0xFF)));
    L.setConf(depConf);


    // Next step: perform eigen-decomposition using LanczosSolver
    // since some of the eigen-output is spurious and will be eliminated
    // upon verification, we have to aim to overshoot and then discard
    // unnecessary vectors later
    int overshoot = (int) ((double) clusters * OVERSHOOT_MULTIPLIER);
    DistributedLanczosSolver solver = new DistributedLanczosSolver();
    LanczosState state = new LanczosState(L, clusters, solver.getInitialVector(L));
    Path lanczosSeqFiles = new Path(outputCalc, "eigenvectors-" + (System.nanoTime() & 0xFF));
    solver.runJob(conf,
                  state,
                  overshoot,
                  true,
                  lanczosSeqFiles.toString());


    // perform a verification
    EigenVerificationJob verifier = new EigenVerificationJob();
    Path verifiedEigensPath = new Path(outputCalc, "eigenverifier");
    verifier.runJob(conf, lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, clusters);
    Path cleanedEigens = verifier.getCleanedEigensPath();
    DistributedRowMatrix W = new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), clusters, numDims);
    W.setConf(depConf);
    DistributedRowMatrix Wtrans = W.transpose();
    //    DistributedRowMatrix Wt = W.transpose();


    // next step: normalize the rows of Wt to unit length
    Path unitVectors = new Path(outputCalc, "unitvectors-" + (System.nanoTime() & 0xFF));
    UnitVectorizerJob.runJob(Wtrans.getRowPath(), unitVectors);
    DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
    Wt.setConf(depConf);


    // Finally, perform k-means clustering on the rows of L (or W)
    // generate random initial clusters
    Path initialclusters = RandomSeedGenerator.buildRandom(conf,
                                                           Wt.getRowPath(),
                                                           new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
                                                           clusters,
                                                           measure);
    
    // The output format is the same as the K-means output format.
    // TODO: Perhaps a conversion of the output format from points and clusters
    // in eigenspace to the original dataset. Currently, the user has to perform
    // the association step after this job finishes on their own.
    KMeansDriver.run(conf,
                     Wt.getRowPath(),
                     initialclusters,
                     output,
                     measure,
                     convergenceDelta,
                     maxIterations,

View Full Code Here

                       int numCols, 
                       Vector b, 
                       Preconditioner preconditioner, 
                       int maxIterations, 
                       double maxError) {
    DistributedRowMatrix matrix = new DistributedRowMatrix(inputPath, tempPath, numRows, numCols);
    matrix.setConf(conf);
        
    return solve(matrix, b, preconditioner, maxIterations, maxError);
  }

View Full Code Here

    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }


    // build the resulting DRM from the results
    return new DistributedRowMatrix(outputPath, tmpPath,
        diag.size(), diag.size());
  }

View Full Code Here

    // set the instance variables
    // create a few new Paths for temp files and transformations
    Path outputCalc = new Path(output, "calculations");
    Path outputTmp = new Path(output, "temporary");


    DistributedRowMatrix A = AffinityMatrixInputJob.runJob(input, outputCalc, dimensions);
    Vector D = MatrixDiagonalizeJob.runJob(A.getRowPath(), dimensions);


    long numCuts;
    do {
      // first three steps are the same as spectral k-means:
      // 1) calculate D from A
      // 2) calculate L = D^-0.5 * A * D^-0.5
      // 3) calculate eigenvectors of L


      DistributedRowMatrix L =
          VectorMatrixMultiplicationJob.runJob(A.getRowPath(), D,
              new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)));
      L.setConf(new Configuration(conf));


      // eigendecomposition (step 3)
      int overshoot = (int) ((double) eigenrank * OVERSHOOT_MULTIPLIER);
      LanczosState state = new LanczosState(L, eigenrank,
          new DistributedLanczosSolver().getInitialVector(L));


      DistributedRowMatrix U = performEigenDecomposition(conf, L, state, eigenrank, overshoot, outputCalc);
      U.setConf(new Configuration(conf));
      List<Double> eigenValues = Lists.newArrayList();
      for (int i=0; i<eigenrank; i++) {
        eigenValues.set(i, state.getSingularValue(i));
      }


      // here's where things get interesting: steps 4, 5, and 6 are unique
      // to this algorithm, and depending on the final output, steps 1-3
      // may be repeated as well


      // helper method, since apparently List and Vector objects don't play nicely
      Vector evs = listToVector(eigenValues);


      // calculate sensitivities (step 4 and step 5)
      Path sensitivities = new Path(outputCalc, "sensitivities-" + (System.nanoTime() & 0xFF));
      EigencutsSensitivityJob.runJob(evs, D, U.getRowPath(), halflife, tau, median(D), epsilon, sensitivities);


      // perform the cuts (step 6)
      input = new Path(outputTmp, "nextAff-" + (System.nanoTime() & 0xFF));
      numCuts = EigencutsAffinityCutsJob.runjob(A.getRowPath(), sensitivities, input, conf);


      // how many cuts were made?
      if (numCuts > 0) {
        // recalculate A
        A = new DistributedRowMatrix(input,
                                     new Path(outputTmp, Long.toString(System.nanoTime())), dimensions, dimensions);
        A.setConf(new Configuration());
      }
    } while (numCuts > 0);

View Full Code Here

    // now run the verifier to trim down the number of eigenvectors
    EigenVerificationJob verifier = new EigenVerificationJob();
    Path verifiedEigens = new Path(tmp, "verifiedeigens");
    verifier.runJob(conf, seqFiles, input.getRowPath(), verifiedEigens, false, 1.0, numEigenVectors);
    Path cleanedEigens = verifier.getCleanedEigensPath();
    return new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), numEigenVectors, input.numRows());
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.mahout.math.hadoop.DistributedRowMatrix

org.apache.hadoop.conf.Configuration

org.apache.hadoop.fs.FileSystem

org.apache.hadoop.fs.Path

org.apache.hadoop.mapred.JobConf

org.apache.hadoop.mapreduce.Job

org.apache.mahout.clustering.spectral.AffinityMatrixInputJob

org.apache.mahout.clustering.spectral.common.AffinityMatrixInputJob

org.apache.mahout.clustering.spectral.common.VectorMatrixMultiplicationJob

org.apache.mahout.clustering.spectral.eigencuts.EigencutsDriver

org.apache.mahout.clustering.spectral.kmeans.SpectralKMeansDriver

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.