
Source Code of de.jungblut.clustering.mapreduce.KMeansClusteringJob

package de.jungblut.clustering.mapreduce;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import de.jungblut.clustering.model.ClusterCenter;
import de.jungblut.clustering.model.VectorWritable;

@SuppressWarnings("deprecation")
public class KMeansClusteringJob {

  private static final Log LOG = LogFactory.getLog(KMeansClusteringJob.class);

  public static void main(String[] args) throws IOException,
      InterruptedException, ClassNotFoundException {

    int iteration = 1;
    Configuration conf = new Configuration();
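    // the iteration number and (below) the centroid path are handed to the
    // mapper and reducer through the job configuration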
    conf.set("num.iteration", iteration + "");

    Path in = new Path("files/clustering/import/data");
    Path center = new Path("files/clustering/import/center/cen.seq");
    conf.set("centroid.path", center.toString());
    Path out = new Path("files/clustering/depth_1");

    Job job = new Job(conf);
    job.setJobName("KMeans Clustering");

    job.setMapperClass(KMeansMapper.class);
    job.setReducerClass(KMeansReducer.class);
    job.setJarByClass(KMeansMapper.class);

    FileInputFormat.addInputPath(job, in);
    FileSystem fs = FileSystem.get(conf);
    // clear output, centers and input from any previous run before seeding new example data
    if (fs.exists(out)) {
      fs.delete(out, true);
    }

    if (fs.exists(center)) {
      fs.delete(center, true);
    }

    if (fs.exists(in)) {
      fs.delete(in, true);
    }

    // seed the initial centers and the example input vectors
    writeExampleCenters(conf, center, fs);

    writeExampleVectors(conf, in, fs);

    FileOutputFormat.setOutputPath(job, out);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(ClusterCenter.class);
    job.setOutputValueClass(VectorWritable.class);

    job.waitForCompletion(true);

    long counter = job.getCounters()
        .findCounter(KMeansReducer.Counter.CONVERGED).getValue();
    iteration++;
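    // keep chaining jobs, each reading the previous depth_N output, until the
    // reducer's CONVERGED counter drops to zero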
    while (counter > 0) {
      conf = new Configuration();
      conf.set("centroid.path", center.toString());
      conf.set("num.iteration", iteration + "");
      job = new Job(conf);
      job.setJobName("KMeans Clustering " + iteration);

      job.setMapperClass(KMeansMapper.class);
      job.setReducerClass(KMeansReducer.class);
      job.setJarByClass(KMeansMapper.class);

      in = new Path("files/clustering/depth_" + (iteration - 1) + "/");
      out = new Path("files/clustering/depth_" + iteration);

      FileInputFormat.addInputPath(job, in);
      if (fs.exists(out))
        fs.delete(out, true);

      FileOutputFormat.setOutputPath(job, out);
      job.setInputFormatClass(SequenceFileInputFormat.class);
      job.setOutputFormatClass(SequenceFileOutputFormat.class);
      job.setOutputKeyClass(ClusterCenter.class);
      job.setOutputValueClass(VectorWritable.class);

      job.waitForCompletion(true);
      iteration++;
      counter = job.getCounters().findCounter(KMeansReducer.Counter.CONVERGED)
          .getValue();
    }

    // read the output of the final iteration and log every center / vector pair
    Path result = new Path("files/clustering/depth_" + (iteration - 1) + "/");

    FileStatus[] stati = fs.listStatus(result);
    for (FileStatus status : stati) {
      if (!status.isDir()) {
        Path path = status.getPath();
        if (!path.getName().equals("_SUCCESS")) {
          LOG.info("FOUND " + path.toString());
          try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, path,
              conf)) {
            ClusterCenter key = new ClusterCenter();
            VectorWritable v = new VectorWritable();
            while (reader.next(key, v)) {
              LOG.info(key + " / " + v);
            }
          }
        }
      }
    }
  }

  public static void writeExampleVectors(Configuration conf, Path in,
      FileSystem fs) throws IOException {
    try (SequenceFile.Writer dataWriter = SequenceFile.createWriter(fs, conf,
        in, ClusterCenter.class, VectorWritable.class)) {
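      // every record uses a (0, 0) dummy center as key; the value vectors are the actual data points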
      dataWriter.append(new ClusterCenter(new VectorWritable(0, 0)),
          new VectorWritable(1, 2));
      dataWriter.append(new ClusterCenter(new VectorWritable(0, 0)),
          new VectorWritable(16, 3));
      dataWriter.append(new ClusterCenter(new VectorWritable(0, 0)),
          new VectorWritable(3, 3));
      dataWriter.append(new ClusterCenter(new VectorWritable(0, 0)),
          new VectorWritable(2, 2));
      dataWriter.append(new ClusterCenter(new VectorWritable(0, 0)),
          new VectorWritable(2, 3));
      dataWriter.append(new ClusterCenter(new VectorWritable(0, 0)),
          new VectorWritable(25, 1));
      dataWriter.append(new ClusterCenter(new VectorWritable(0, 0)),
          new VectorWritable(7, 6));
      dataWriter.append(new ClusterCenter(new VectorWritable(0, 0)),
          new VectorWritable(6, 5));
      dataWriter.append(new ClusterCenter(new VectorWritable(0, 0)),
          new VectorWritable(-1, -23));
    }
  }

  public static void writeExampleCenters(Configuration conf, Path center,
      FileSystem fs) throws IOException {
    try (SequenceFile.Writer centerWriter = SequenceFile.createWriter(fs, conf,
        center, ClusterCenter.class, IntWritable.class)) {
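      // two seed centers, (1, 1) and (5, 5); the shared IntWritable is only a placeholder value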
      final IntWritable value = new IntWritable(0);
      centerWriter.append(new ClusterCenter(new VectorWritable(1, 1)), value);
      centerWriter.append(new ClusterCenter(new VectorWritable(5, 5)), value);
    }
  }

}
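
The driver above expects KMeansMapper to load the seeded centers back through the "centroid.path" configuration entry, but that class is not part of this listing. What follows is only a minimal sketch of that loading pattern, reusing the SequenceFile.Reader calls already shown in main(); the class name CentroidLoadingMapperSketch is invented for illustration, and the nearest-center selection is left as a placeholder because it depends on the VectorWritable API.

package de.jungblut.clustering.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Mapper;

import de.jungblut.clustering.model.ClusterCenter;
import de.jungblut.clustering.model.VectorWritable;

// Sketch only -- this is NOT the original KMeansMapper. It just illustrates how a
// mapper can read the centers that main() wrote under the "centroid.path" setting.
@SuppressWarnings("deprecation")
public class CentroidLoadingMapperSketch extends
    Mapper<ClusterCenter, VectorWritable, ClusterCenter, VectorWritable> {

  private final List<ClusterCenter> centers = new ArrayList<>();

  @Override
  protected void setup(Context context) throws IOException,
      InterruptedException {
    Configuration conf = context.getConfiguration();
    Path centroids = new Path(conf.get("centroid.path"));
    FileSystem fs = FileSystem.get(conf);
    // the same ClusterCenter/IntWritable pairs that writeExampleCenters appended
    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, centroids,
        conf)) {
      ClusterCenter key = new ClusterCenter();
      IntWritable value = new IntWritable();
      while (reader.next(key, value)) {
        centers.add(key);
        key = new ClusterCenter(); // fresh instance so the list entries stay distinct
      }
    }
  }

  @Override
  protected void map(ClusterCenter key, VectorWritable value, Context context)
      throws IOException, InterruptedException {
    // placeholder: a real k-means mapper picks the center closest to 'value';
    // that distance computation depends on the VectorWritable API and is omitted here
    ClusterCenter nearest = centers.get(0);
    context.write(nearest, value);
  }
}

Reading the centers once in setup() keeps the per-record map() call cheap; each input vector is then emitted unchanged under its chosen center so the reducer can recompute that center from its assigned vectors.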