Package org.apache.mahout.math.hadoop

Examples of org.apache.mahout.math.hadoop.DistributedRowMatrix


  private static final Logger log = LoggerFactory.getLogger(TestDistributedLanczosSolverCLI.class);

  @Test
  public void testDistributedLanczosSolverCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus =
        new TestDistributedRowMatrix().randomDenseHierarchicalDistributedMatrix(10, 9, false,
            testData.toString());
    corpus.setConf(new Configuration());
    Path output = getTestTempDirPath("output");
    Path tmp = getTestTempDirPath("tmp");
    Path workingDir = getTestTempDirPath("working");
    String[] args = {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "10",
        "--numCols", "9",
        "--rank", "6",
        "--symmetric", "false",
        "--workingDir", workingDir.toString()
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);

    output = getTestTempDirPath("output2");
    tmp = getTestTempDirPath("tmp2");
    args = new String[] {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "10",
        "--numCols", "9",
        "--rank", "7",
        "--symmetric", "false",
        "--workingDir", workingDir.toString()
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);

    Path rawEigenvectors = new Path(output, DistributedLanczosSolver.RAW_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(7, corpus.numCols());
    Configuration conf = new Configuration();

    int i = 0;
    for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(rawEigenvectors, conf)) {
      Vector v = value.get();
View Full Code Here


  }

  @Test
  public void testDistributedLanczosSolverEVJCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus = new TestDistributedRowMatrix()
        .randomDenseHierarchicalDistributedMatrix(10, 9, false, testData.toString());
    corpus.setConf(new Configuration());
    Path output = getTestTempDirPath("output");
    Path tmp = getTestTempDirPath("tmp");
    String[] args = {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "10",
        "--numCols", "9",
        "--rank", "6",
        "--symmetric", "false",
        "--cleansvd", "true"
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);
 
    Path cleanEigenvectors = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(6, corpus.numCols());
    Collection<Double> eigenvalues = Lists.newArrayList();

    output = getTestTempDirPath("output2");
    tmp = getTestTempDirPath("tmp2");
    args = new String[] {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "10",
        "--numCols", "9",
        "--rank", "7",
        "--symmetric", "false",
        "--cleansvd", "true"
    };
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);
    Path cleanEigenvectors2 = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors2 = new DenseMatrix(7, corpus.numCols());
    Configuration conf = new Configuration();
    Collection<Double> newEigenValues = Lists.newArrayList();

    int i = 0;
    for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(cleanEigenvectors, conf)) {
View Full Code Here

   
    Path cleanEigenvectors = new Path(output,
        EigenVerificationJob.CLEAN_EIGENVECTORS);
   
    // now multiply the testdata matrix and the eigenvector matrix
    DistributedRowMatrix svdT = new DistributedRowMatrix(cleanEigenvectors,
        tmp, desiredRank, sampleDimension);
    Configuration conf = new Configuration(config);
    svdT.setConf(conf);
    DistributedRowMatrix a = new DistributedRowMatrix(testData, tmp,
        sampleData.size(), sampleDimension);
    a.setConf(conf);
    DistributedRowMatrix sData = a.transpose().times(svdT.transpose());
    sData.setConf(conf);
   
    // now run the Canopy job to prime kMeans canopies
    CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false,
        true);
    // now run the KMeans job
    KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
        output, measure, 0.001, 10, true, true);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
        output, 10), new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(termDictionary);
View Full Code Here

        0.0, true, conf);
    Path cleanEigenvectors = new Path(output,
        EigenVerificationJob.CLEAN_EIGENVECTORS);
   
    // now multiply the testdata matrix and the eigenvector matrix
    DistributedRowMatrix svdT = new DistributedRowMatrix(cleanEigenvectors,
        tmp, desiredRank, sampleDimension);
    svdT.setConf(conf);
    DistributedRowMatrix a = new DistributedRowMatrix(testData, tmp,
        sampleData.size(), sampleDimension);
    a.setConf(conf);
    DistributedRowMatrix sData = a.transpose().times(svdT.transpose());
    sData.setConf(conf);
   
    // now run the Canopy job to prime kMeans canopies
    CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false,
        true);
    // now run the KMeans job
    KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
        output, measure, 0.001, 10, true, true);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
        output, 10), new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(termDictionary);
View Full Code Here

    job.setJarByClass(VectorMatrixMultiplicationJob.class);

    job.waitForCompletion(true);

    // build the resulting DRM from the results
    return new DistributedRowMatrix(outputPath, tmpPath,
        diag.size(), diag.size());
  }
View Full Code Here

   */
  public static DistributedRowMatrix runJob(Path input, Path output, int dimensions)
    throws IOException, InterruptedException, ClassNotFoundException {
    Path seqFiles = new Path(output, "seqfiles-" + (System.nanoTime() & 0xFF));
    runJob(input, seqFiles, dimensions, dimensions);
    DistributedRowMatrix a = new DistributedRowMatrix(seqFiles,
        new Path(seqFiles, "seqtmp-" + (System.nanoTime() & 0xFF)),
        dimensions, dimensions);
    a.setConf(new Configuration());
    return a;
  }
View Full Code Here

    this.minEigenValue = minEigenValue;

    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(config, eigenInput, inMemory);
    }
    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tempOut, 1, 1);
    c.configure(config);
    corpus = c;

    // set up eigenverifier and orthoverifier TODO: allow multithreaded execution

    eigenVerifier = new SimpleEigenVerifier();
View Full Code Here

    }
    return eigenMetaData;
  }

  private void prepareEigens(JobConf conf, Path eigenInput, boolean inMemory) {
    DistributedRowMatrix eigens = new DistributedRowMatrix(eigenInput, tmpOut, 1, 1);
    eigens.configure(conf);
    if (inMemory) {
      List<Vector> eigenVectors = new ArrayList<Vector>();
      for (MatrixSlice slice : eigens) {
        eigenVectors.add(slice.vector());
      }
View Full Code Here

    this.maxError = maxError;
    if (eigenInput != null && eigensToVerify == null) {
      prepareEigens(new JobConf(conf), eigenInput, inMemory);
    }

    DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1);
    c.configure(new JobConf(conf));
    corpus = c;

    eigenVerifier = new SimpleEigenVerifier();
    //OrthonormalityVerifier orthoVerifier = new OrthonormalityVerifier();
    VectorIterable pairwiseInnerProducts = computePairwiseInnerProducts();
View Full Code Here

  public void runJob(Configuration originalConfig, Path inputPath,
            Path outputTmpPath, int numRows, int numCols,
            boolean isSymmetric, int desiredRank, Matrix eigenVectors,
            List<Double> eigenValues, String outputEigenVectorPathString)
            throws IOException {
    DistributedRowMatrix matrix = new DistributedRowMatrix(
                      inputPath, outputTmpPath,
                      numRows, numCols);
    matrix.configure(new JobConf(originalConfig));
    setConf(originalConfig);
    solve(matrix, desiredRank, eigenVectors, eigenValues, isSymmetric);
    serializeOutput(eigenVectors, eigenValues, new Path(outputEigenVectorPathString));
  }
View Full Code Here

TOP

Related Classes of org.apache.mahout.math.hadoop.DistributedRowMatrix

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.