Examples of org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure
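SquaredEuclideanDistanceMeasure implements Mahout's DistanceMeasure interface but omits the final square root, so distance(a, b) returns || a - b ||^2. It preserves the same nearest-neighbor ordering as EuclideanDistanceMeasure while being cheaper to evaluate, which is why the clustering and search examples below favor it. A minimal standalone sketch (the class name SquaredEuclideanExample and the vector values are ours; the calls are the API as used in the snippets below):

import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class SquaredEuclideanExample {
  public static void main(String[] args) {
    DistanceMeasure measure = new SquaredEuclideanDistanceMeasure();
    Vector a = new DenseVector(new double[] {1.0, 2.0, 3.0});
    Vector b = new DenseVector(new double[] {4.0, 6.0, 3.0});
    // (4-1)^2 + (6-2)^2 + (3-3)^2 = 9 + 16 + 0
    System.out.println(measure.distance(a, b));  // prints 25.0
  }
}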

      radius.add(furthest);
    }
    log.warn("Reference data stored");
    log.warn("Starting add with speedup of {}", numDataVectors / (dimension * 2.0 * depth * 4.0));

    Searcher sut = new FastProjectionSearch(new SquaredEuclideanDistanceMeasure(), dimension * 2, depth * 4);
    sut.addAllMatrixSlicesAsWeightedVectors(data);
    log.warn("Added data with speedup of {}", numDataVectors / (dimension * 2.0 * depth * 4.0));

    long t0 = System.nanoTime();
    for (MatrixSlice query : queries) {
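The snippet above builds a FastProjectionSearch over dimension * 2 random projections with a search size of depth * 4, bulk-loads a matrix, and the truncated loop then times queries against it. A minimal sketch of the query side, assuming the Searcher.search(query, limit) API that returns WeightedThing results ordered by distance (the class name and vector values are ours):

import java.util.List;

import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.neighborhood.FastProjectionSearch;
import org.apache.mahout.math.random.WeightedThing;

public class ProjectionQuerySketch {
  public static void main(String[] args) {
    // Arguments: distance measure, number of projections, search size.
    FastProjectionSearch sut =
        new FastProjectionSearch(new SquaredEuclideanDistanceMeasure(), 4, 10);
    sut.add(new DenseVector(new double[] {0, 0, 1}));
    sut.add(new DenseVector(new double[] {0, 1, 0}));
    sut.add(new DenseVector(new double[] {1, 0, 0}));

    // Approximate two nearest neighbors; with this measure the weight is the squared distance.
    List<WeightedThing<Vector>> hits =
        sut.search(new DenseVector(new double[] {1, 0, 0.1}), 2);
    for (WeightedThing<Vector> hit : hits) {
      System.out.printf("d^2 = %.3f -> %s%n", hit.getWeight(), hit.getValue());
    }
  }
}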

Examples of org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure

      mark.createBenchmark();
      mark.incrementalCreateBenchmark();
      mark.cloneBenchmark();
      mark.dotBenchmark();
      mark.distanceMeasureBenchmark(new CosineDistanceMeasure());
      mark.distanceMeasureBenchmark(new SquaredEuclideanDistanceMeasure());
      mark.distanceMeasureBenchmark(new EuclideanDistanceMeasure());
      //mark.distanceMeasureBenchmark(new ManhattanDistanceMeasure());
      mark.distanceMeasureBenchmark(new TanimotoDistanceMeasure());
     
      log.info("\n{}", mark.summarize());

Examples of org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure

      center.update(row);
    }
    // Given the centroid, we can compute \Delta_1^2(X), the total squared distance of the data points
    // from the centroid; this accelerates seed selection.
    double radius = 0;
    DistanceMeasure l2 = new SquaredEuclideanDistanceMeasure();
    for (WeightedVector row : datapoints) {
      radius += l2.distance(row, center);
    }

    // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that
    // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2.  This is done
    // by first selecting c_1 with probability:
    //
    // p(c_1) = (sum_{c_2} || c_1 - c_2 ||^2) / (sum_{c_1, c_2} || c_1 - c_2 ||^2)
    //
    // This can be simplified to:
    //
    // p(c_1) = (\Delta_1^2(X) + n || c_1 - c ||^2) / (2 n \Delta_1^2(X))
    //
    // where c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2
    //
    // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability
    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.

    // Multinomial distribution over vector indices, used to select the seeds. The entries correspond to
    // the indices of the vectors in the original datapoints list.
    Multinomial<Integer> seedSelector = new Multinomial<Integer>();
    for (int i = 0; i < datapoints.size(); ++i) {
      double selectionProbability =
          radius + datapoints.size() * l2.distance(datapoints.get(i), center);
      seedSelector.add(i, selectionProbability);
    }

    Centroid c_1 = new Centroid((WeightedVector)datapoints.get(seedSelector.sample()).clone());
    c_1.setIndex(0);
    // Construct a set of weighted things which can be used for random selection. Initial weights are
    // set to the squared distance from c_1.
    for (int i = 0; i < datapoints.size(); ++i) {
      WeightedVector row = datapoints.get(i);
      final double w = l2.distance(c_1, row) * row.getWeight();
      seedSelector.set(i, w);
    }

    // From here, seeds are selected with probability proportional to:
    //
    // r_i = min_{c_j} || x_i - c_j ||^2
    //
    // With only c_1 selected, these weights are exactly the distances set above; as each new
    // seed is selected, we update the minimum distances.
    centroids.add(c_1);
    int clusterIndex = 1;
    while (centroids.size() < numClusters) {
      // Select according to weights.
      int seedIndex = seedSelector.sample();
      Centroid nextSeed = new Centroid((WeightedVector)datapoints.get(seedIndex).clone());
      nextSeed.setIndex(clusterIndex++);
      centroids.add(nextSeed);
      // Don't select this one again.
      seedSelector.set(seedIndex, 0);
      // Re-weight everything according to the minimum distance to a seed.
      for (int currSeedIndex : seedSelector) {
        WeightedVector curr = datapoints.get(currSeedIndex);
        double newWeight = nextSeed.getWeight() * l2.distance(nextSeed, curr);
        if (newWeight < seedSelector.getWeight(currSeedIndex)) {
          seedSelector.set(currSeedIndex, newWeight);
        }
      }
    }
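To make the re-weighting concrete: after each seed is chosen, every remaining point's selection weight becomes its squared distance to the nearest seed so far, r_i = min_{c_j} || x_i - c_j ||^2, so far-away points are proportionally more likely to become the next seed. A standalone numeric sketch of that rule with a single seed (the class name and points are ours):

import java.util.Arrays;
import java.util.List;

import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class SeedWeightSketch {
  public static void main(String[] args) {
    DistanceMeasure l2 = new SquaredEuclideanDistanceMeasure();
    List<Vector> points = Arrays.<Vector>asList(
        new DenseVector(new double[] {0, 0}),
        new DenseVector(new double[] {1, 0}),
        new DenseVector(new double[] {10, 10}));
    Vector seed = points.get(0);  // pretend c_1 was sampled as the first point

    double[] weights = new double[points.size()];
    for (int i = 0; i < points.size(); i++) {
      // With one seed, r_i is just the squared distance to it.
      weights[i] = l2.distance(seed, points.get(i));
    }
    // Prints [0.0, 1.0, 200.0]: the outlier at (10, 10) dominates the next draw.
    System.out.println(Arrays.toString(weights));
  }
}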

Examples of org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure

      mark.cloneBenchmark();
      mark.dotBenchmark();
      mark.serializeBenchmark();
      mark.deserializeBenchmark();
      mark.distanceMeasureBenchmark(new CosineDistanceMeasure());
      mark.distanceMeasureBenchmark(new SquaredEuclideanDistanceMeasure());
      mark.distanceMeasureBenchmark(new EuclideanDistanceMeasure());
      mark.distanceMeasureBenchmark(new ManhattanDistanceMeasure());
      mark.distanceMeasureBenchmark(new TanimotoDistanceMeasure());
     
      log.info("\n{}", mark.summarize());

Examples of org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure

          new SequenceFileDirValueIterable<VectorWritable>(new Path(trainFile), PathType.GLOB, conf);
      Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
      Iterable<Vector> datapoints = trainDatapoints;

      printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
          new SquaredEuclideanDistanceMeasure()), "train");

      // Also adding in the "test" set.
      if (testFile != null) {
        SequenceFileDirValueIterable<VectorWritable> testIterable =
            new SequenceFileDirValueIterable<VectorWritable>(new Path(testFile), PathType.GLOB, conf);
        Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);

        printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
            new SquaredEuclideanDistanceMeasure()), "test");

        datapoints = Iterables.concat(trainDatapoints, testDatapoints);
      }

      // At this point, all train/test CSVs have been written. We now compute quality metrics.
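The summaries above come from ClusteringUtils.summarizeClusterDistances, which, as used here, aggregates each point's distance to its closest centroid under the given measure. A hand-rolled sketch of that same nearest-centroid statistic, using only the distance API already shown (the class name and data are ours):

import java.util.Arrays;
import java.util.List;

import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class NearestCentroidCost {
  public static void main(String[] args) {
    DistanceMeasure measure = new SquaredEuclideanDistanceMeasure();
    List<Vector> centroids = Arrays.<Vector>asList(
        new DenseVector(new double[] {0, 0}),
        new DenseVector(new double[] {10, 10}));
    List<Vector> datapoints = Arrays.<Vector>asList(
        new DenseVector(new double[] {1, 0}),
        new DenseVector(new double[] {9, 10}));

    double totalCost = 0;
    for (Vector point : datapoints) {
      double best = Double.POSITIVE_INFINITY;
      for (Vector centroid : centroids) {
        best = Math.min(best, measure.distance(point, centroid));
      }
      totalCost += best;  // squared distance to the closest centroid
    }
    System.out.println(totalCost);  // 1.0 + 1.0 = 2.0
  }
}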

Examples of org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure

        mapDriver.getConfiguration().get(StreamingKMeansDriver.SEARCHER_CLASS_OPTION));
    for (Centroid datapoint : syntheticData.getFirst()) {
      mapDriver.addInput(new IntWritable(0), new VectorWritable(datapoint));
    }
    List<org.apache.hadoop.mrunit.types.Pair<IntWritable,CentroidWritable>> results = mapDriver.run();
    BruteSearch resultSearcher = new BruteSearch(new SquaredEuclideanDistanceMeasure());
    for (org.apache.hadoop.mrunit.types.Pair<IntWritable, CentroidWritable> result : results) {
      resultSearcher.add(result.getSecond().getCentroid());
    }
    System.out.printf("Clustered the data into %d clusters\n", results.size());
    for (Vector mean : syntheticData.getSecond()) {
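The truncated loop presumably checks each synthetic mean against the clustered output; because BruteSearch scans every added vector, the lookup is exact rather than approximate. A minimal sketch of that verification pattern, assuming the same Searcher.search(query, limit) API as above (the class name and values are ours):

import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.neighborhood.BruteSearch;

public class NearestClusterCheck {
  public static void main(String[] args) {
    BruteSearch resultSearcher = new BruteSearch(new SquaredEuclideanDistanceMeasure());
    resultSearcher.add(new DenseVector(new double[] {0.1, -0.2}));
    resultSearcher.add(new DenseVector(new double[] {9.8, 10.1}));

    Vector trueMean = new DenseVector(new double[] {10, 10});
    // Exact nearest neighbor; the weight is the squared distance.
    double d2 = resultSearcher.search(trueMean, 1).get(0).getWeight();
    System.out.printf("nearest centroid is %.3f away (squared)%n", d2);
  }
}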

Examples of org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure

  private static final int K1 = 100;

  @Test
  public void testClusteringMultipleRuns() {
    for (int i = 1; i <= 10; ++i) {
      BallKMeans clusterer = new BallKMeans(new BruteSearch(new SquaredEuclideanDistanceMeasure()),
          1 << NUM_DIMENSIONS, NUM_ITERATIONS, true, i);
      clusterer.cluster(syntheticData.getFirst());
      double costKMeansPlusPlus = ClusteringUtils.totalClusterCost(syntheticData.getFirst(), clusterer);

      clusterer = new BallKMeans(new BruteSearch(new SquaredEuclideanDistanceMeasure()),
          1 << NUM_DIMENSIONS, NUM_ITERATIONS, false, i);
      clusterer.cluster(syntheticData.getFirst());
      double costKMeansRandom = ClusteringUtils.totalClusterCost(syntheticData.getFirst(), clusterer);

      System.out.printf("%d runs; kmeans++: %f; random: %f\n", i, costKMeansPlusPlus, costKMeansRandom);

Examples of org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure

    }
  }

  @Test
  public void testClustering() {
    UpdatableSearcher searcher = new BruteSearch(new SquaredEuclideanDistanceMeasure());
    BallKMeans clusterer = new BallKMeans(searcher, 1 << NUM_DIMENSIONS, NUM_ITERATIONS);

    long startTime = System.currentTimeMillis();
    clusterer.cluster(syntheticData.getFirst());
    long endTime = System.currentTimeMillis();
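Both tests drive the same basic flow: wrap the distance measure in an UpdatableSearcher, hand it to BallKMeans along with a cluster count and an iteration budget, and call cluster(...). A minimal end-to-end sketch, under the assumption that BallKMeans is iterable over its resulting Centroids as the test code above implies (the class name and toy data are ours):

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import org.apache.mahout.clustering.streaming.cluster.BallKMeans;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.Centroid;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.WeightedVector;
import org.apache.mahout.math.neighborhood.BruteSearch;
import org.apache.mahout.math.neighborhood.UpdatableSearcher;

public class BallKMeansSketch {
  public static void main(String[] args) {
    Random rand = new Random(42);
    List<WeightedVector> data = new ArrayList<WeightedVector>();
    for (int i = 0; i < 100; i++) {
      double offset = i < 50 ? 0 : 10;  // two well-separated blobs
      DenseVector v = new DenseVector(
          new double[] {offset + rand.nextGaussian(), offset + rand.nextGaussian()});
      data.add(new WeightedVector(v, 1, i));
    }

    UpdatableSearcher searcher = new BruteSearch(new SquaredEuclideanDistanceMeasure());
    BallKMeans clusterer = new BallKMeans(searcher, 2, 20);
    clusterer.cluster(data);
    for (Centroid centroid : clusterer) {
      System.out.println(centroid);  // expect means near (0, 0) and (10, 10)
    }
  }
}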

Examples of org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure

  public void testInitialization() {
    // Start with super clusterable data.
    List<? extends WeightedVector> data = cubishTestData(0.01);

    // Just do initialization of ball k-means. This should drop a point into each of the clusters.
    BallKMeans r = new BallKMeans(new BruteSearch(new SquaredEuclideanDistanceMeasure()), 6, 20);
    r.cluster(data);

    // Put the centroids into a matrix.
    Matrix x = new DenseMatrix(6, 5);
    int row = 0;

Examples of org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure

  }

  @Parameters
  public static List<Object[]> generateData() {
    return Arrays.asList(new Object[][] {
        {new ProjectionSearch(new SquaredEuclideanDistanceMeasure(), NUM_PROJECTIONS, SEARCH_SIZE), true},
        {new FastProjectionSearch(new SquaredEuclideanDistanceMeasure(), NUM_PROJECTIONS, SEARCH_SIZE),
            true},
        {new ProjectionSearch(new SquaredEuclideanDistanceMeasure(), NUM_PROJECTIONS, SEARCH_SIZE), false},
        {new FastProjectionSearch(new SquaredEuclideanDistanceMeasure(), NUM_PROJECTIONS, SEARCH_SIZE),
            false},
    });
  }