Package org.apache.mahout.clustering.display

Source Code of org.apache.mahout.clustering.display.DisplayClustering

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.clustering.display;

import java.awt.BasicStroke;
import java.awt.Color;
import java.awt.Frame;
import java.awt.Graphics;
import java.awt.Graphics2D;
import java.awt.Toolkit;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.awt.geom.AffineTransform;
import java.awt.geom.Ellipse2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.classify.WeightedVectorWritable;
import org.apache.mahout.clustering.dirichlet.UncommonDistributions;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;

public class DisplayClustering extends Frame {
 
  private static final Logger log = LoggerFactory.getLogger(DisplayClustering.class);
 
  protected static final int DS = 72; // default scale = 72 pixels per inch
 
  protected static final int SIZE = 8; // screen size in inches
 
  private static final Collection<Vector> SAMPLE_PARAMS = Lists.newArrayList();
 
  protected static final List<VectorWritable> SAMPLE_DATA = Lists.newArrayList();
 
  protected static final List<List<Cluster>> CLUSTERS = Lists.newArrayList();
 
  static final Color[] COLORS = { Color.red, Color.orange, Color.yellow, Color.green, Color.blue, Color.magenta,
    Color.lightGray };
 
  protected static final double T1 = 3.0;
 
  protected static final double T2 = 2.8;
 
  static double significance = 0.05;
 
  protected static int res; // screen resolution
 
  public DisplayClustering() {
    initialize();
    this.setTitle("Sample Data");
  }
 
  public void initialize() {
    // Get screen resolution
    res = Toolkit.getDefaultToolkit().getScreenResolution();
   
    // Set Frame size in inches
    this.setSize(SIZE * res, SIZE * res);
    this.setVisible(true);
    this.setTitle("Asymmetric Sample Data");
   
    // Window listener to terminate program.
    this.addWindowListener(new WindowAdapter() {
      @Override
      public void windowClosing(WindowEvent e) {
        System.exit(0);
      }
    });
  }
 
  public static void main(String[] args) throws Exception {
    RandomUtils.useTestSeed();
    generateSamples();
    new DisplayClustering();
  }
 
  // Override the paint() method
  @Override
  public void paint(Graphics g) {
    Graphics2D g2 = (Graphics2D) g;
    plotSampleData(g2);
    plotSampleParameters(g2);
    plotClusters(g2);
  }
 
  protected static void plotClusters(Graphics2D g2) {
    int cx = CLUSTERS.size() - 1;
    for (List<Cluster> clusters : CLUSTERS) {
      g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
      g2.setColor(COLORS[Math.min(COLORS.length - 1, cx--)]);
      for (Cluster cluster : clusters) {
        plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
      }
    }
  }
 
  protected static void plotSampleParameters(Graphics2D g2) {
    Vector v = new DenseVector(2);
    Vector dv = new DenseVector(2);
    g2.setColor(Color.RED);
    for (Vector param : SAMPLE_PARAMS) {
      v.set(0, param.get(0));
      v.set(1, param.get(1));
      dv.set(0, param.get(2) * 3);
      dv.set(1, param.get(3) * 3);
      plotEllipse(g2, v, dv);
    }
  }
 
  protected static void plotSampleData(Graphics2D g2) {
    double sx = (double) res / DS;
    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
   
    // plot the axes
    g2.setColor(Color.BLACK);
    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
    plotRectangle(g2, new DenseVector(2).assign(2), dv);
    plotRectangle(g2, new DenseVector(2).assign(-2), dv);
   
    // plot the sample data
    g2.setColor(Color.DARK_GRAY);
    dv.assign(0.03);
    for (VectorWritable v : SAMPLE_DATA) {
      plotRectangle(g2, v.get(), dv);
    }
  }
 
  /**
   * This method plots points and colors them according to their cluster
   * membership, rather than drawing ellipses.
   *
   * As of commit, this method is used only by K-means spectral clustering.
   * Since the cluster assignments are set within the eigenspace of the data, it
   * is not inherent that the original data cluster as they would in K-means:
   * that is, as symmetric gaussian mixtures.
   *
   * Since Spectral K-Means uses K-Means to cluster the eigenspace data, the raw
   * output is not directly usable. Rather, the cluster assignments from the raw
   * output need to be transferred back to the original data. As such, this
   * method will read the SequenceFile cluster results of K-means and transfer
   * the cluster assignments to the original data, coloring them appropriately.
   *
   * @param g2
   * @param data
   */
  protected static void plotClusteredSampleData(Graphics2D g2, Path data) {
    double sx = (double) res / DS;
    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
   
    g2.setColor(Color.BLACK);
    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
    plotRectangle(g2, new DenseVector(2).assign(2), dv);
    plotRectangle(g2, new DenseVector(2).assign(-2), dv);
   
    // plot the sample data, colored according to the cluster they belong to
    dv.assign(0.03);
   
    Path clusteredPointsPath = new Path(data, "clusteredPoints");
    Path inputPath = new Path(clusteredPointsPath, "part-m-00000");
    Map<Integer,Color> colors = new HashMap<Integer,Color>();
    int point = 0;
    for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
        inputPath, new Configuration())) {
      int clusterId = record.getFirst().get();
      VectorWritable v = SAMPLE_DATA.get(point++);
      Integer key = clusterId;
      if (!colors.containsKey(key)) {
        colors.put(key, COLORS[Math.min(COLORS.length - 1, colors.size())]);
      }
      plotClusteredRectangle(g2, v.get(), dv, colors.get(key));
    }
  }
 
  /**
   * Identical to plotRectangle(), but with the option of setting the color of
   * the rectangle's stroke.
   *
   * NOTE: This should probably be refactored with plotRectangle() since most of
   * the code here is direct copy/paste from that method.
   *
   * @param g2
   *          A Graphics2D context.
   * @param v
   *          A vector for the rectangle's center.
   * @param dv
   *          A vector for the rectangle's dimensions.
   * @param color
   *          The color of the rectangle's stroke.
   */
  protected static void plotClusteredRectangle(Graphics2D g2, Vector v, Vector dv, Color color) {
    double[] flip = {1, -1};
    Vector v2 = v.times(new DenseVector(flip));
    v2 = v2.minus(dv.divide(2));
    int h = SIZE / 2;
    double x = v2.get(0) + h;
    double y = v2.get(1) + h;
   
    g2.setStroke(new BasicStroke(1));
    g2.setColor(color);
    g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
  }
 
  /**
   * Draw a rectangle on the graphics context
   *
   * @param g2
   *          a Graphics2D context
   * @param v
   *          a Vector of rectangle center
   * @param dv
   *          a Vector of rectangle dimensions
   */
  protected static void plotRectangle(Graphics2D g2, Vector v, Vector dv) {
    double[] flip = {1, -1};
    Vector v2 = v.times(new DenseVector(flip));
    v2 = v2.minus(dv.divide(2));
    int h = SIZE / 2;
    double x = v2.get(0) + h;
    double y = v2.get(1) + h;
    g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
  }
 
  /**
   * Draw an ellipse on the graphics context
   *
   * @param g2
   *          a Graphics2D context
   * @param v
   *          a Vector of ellipse center
   * @param dv
   *          a Vector of ellipse dimensions
   */
  protected static void plotEllipse(Graphics2D g2, Vector v, Vector dv) {
    double[] flip = {1, -1};
    Vector v2 = v.times(new DenseVector(flip));
    v2 = v2.minus(dv.divide(2));
    int h = SIZE / 2;
    double x = v2.get(0) + h;
    double y = v2.get(1) + h;
    g2.draw(new Ellipse2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
  }
 
  protected static void generateSamples() {
    generateSamples(500, 1, 1, 3);
    generateSamples(300, 1, 0, 0.5);
    generateSamples(300, 0, 2, 0.1);
  }
 
  protected static void generate2dSamples() {
    generate2dSamples(500, 1, 1, 3, 1);
    generate2dSamples(300, 1, 0, 0.5, 1);
    generate2dSamples(300, 0, 2, 0.1, 0.5);
  }
 
  /**
   * Generate random samples and add them to the sampleData
   *
   * @param num
   *          int number of samples to generate
   * @param mx
   *          double x-value of the sample mean
   * @param my
   *          double y-value of the sample mean
   * @param sd
   *          double standard deviation of the samples
   */
  protected static void generateSamples(int num, double mx, double my, double sd) {
    double[] params = {mx, my, sd, sd};
    SAMPLE_PARAMS.add(new DenseVector(params));
    log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
    for (int i = 0; i < num; i++) {
      SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
          UncommonDistributions.rNorm(my, sd)})));
    }
  }
 
  protected static void writeSampleData(Path output) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, VectorWritable.class);
    try {
      int i = 0;
      for (VectorWritable vw : SAMPLE_DATA) {
        writer.append(new Text("sample_" + i++), vw);
      }
    } finally {
      Closeables.close(writer, false);
    }
  }
 
  protected static List<Cluster> readClustersWritable(Path clustersIn) {
    List<Cluster> clusters = Lists.newArrayList();
    Configuration conf = new Configuration();
    for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
        PathFilters.logsCRCFilter(), conf)) {
      Cluster cluster = value.getValue();
      log.info(
          "Reading Cluster:{} center:{} numPoints:{} radius:{}",
          cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
          cluster.getNumObservations(), AbstractCluster.formatVector(cluster.getRadius(), null));
      clusters.add(cluster);
    }
    return clusters;
  }
 
  protected static void loadClustersWritable(Path output) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    for (FileStatus s : fs.listStatus(output, new ClustersFilter())) {
      List<Cluster> clusters = readClustersWritable(s.getPath());
      CLUSTERS.add(clusters);
    }
  }
 
  /**
   * Generate random samples and add them to the sampleData
   *
   * @param num
   *          int number of samples to generate
   * @param mx
   *          double x-value of the sample mean
   * @param my
   *          double y-value of the sample mean
   * @param sdx
   *          double x-value standard deviation of the samples
   * @param sdy
   *          double y-value standard deviation of the samples
   */
  protected static void generate2dSamples(int num, double mx, double my, double sdx, double sdy) {
    double[] params = {mx, my, sdx, sdy};
    SAMPLE_PARAMS.add(new DenseVector(params));
    log.info("Generating {} samples m=[{}, {}] sd=[{}, {}]", num, mx, my, sdx, sdy);
    for (int i = 0; i < num; i++) {
      SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sdx),
          UncommonDistributions.rNorm(my, sdy)})));
    }
  }
 
  protected static boolean isSignificant(Cluster cluster) {
    return (double) cluster.getNumObservations() / SAMPLE_DATA.size() > significance;
  }
}
TOP

Related Classes of org.apache.mahout.clustering.display.DisplayClustering

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.