Package org.apache.mahout.clustering.fuzzykmeans

Source Code of org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.clustering.fuzzykmeans;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueLineRecordReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FuzzyKMeansDriver {

  private static final Logger log = LoggerFactory
      .getLogger(FuzzyKMeansDriver.class);

  public static final String MAPPER_VALUE_SEPARATOR = "~";

  public static final String COMBINER_VALUE_SEPARATOR = "\t";

  private FuzzyKMeansDriver() {
  }

  private static void printMessage() {
    System.out
        .println("Usage: input clusterIn output measureClass convergenceDelta maxIterations m [doClusteringOnly]");
  }

  public static void main(String[] args) {
    if (args.length < 7) {
      System.out.println("Expected number of arguments: 7 or 8 : received:"
          + args.length);
      printMessage();
    }
    int index = 0;
    String input = args[index++];
    String clusters = args[index++];
    String output = args[index++];
    String measureClass = args[index++];
    double convergenceDelta = Double.parseDouble(args[index++]);
    int maxIterations = new Integer(args[index++]);
    float m = Float.parseFloat(args[index++]);
    boolean doClustering = false;
    if (args.length > 7)
      doClustering = Boolean.parseBoolean(args[index++]);
    if (doClustering) {
      runClustering(input, clusters, output, measureClass, Double
          .toString(convergenceDelta), 500, m);
    } else {
      runJob(input, clusters, output, measureClass, convergenceDelta,
          maxIterations, 10, 10, m);

    }

  }

  /**
   * Run the job using supplied arguments
   *
   * @param input the directory pathname for input points
   * @param clustersIn the directory pathname for initial & computed clusters
   * @param output the directory pathname for output points
   * @param measureClass the classname of the DistanceMeasure
   * @param convergenceDelta the convergence delta value
   * @param maxIterations the maximum number of iterations
   * @param numMapTasks the number of mapper tasks
   */
  public static void runJob(String input, String clustersIn, String output,
      String measureClass, double convergenceDelta, int maxIterations,
      int numMapTasks, int numReduceTasks, float m) {

    boolean converged = false;
    int iteration = 0;
    String delta = Double.toString(convergenceDelta);

    // iterate until the clusters converge
    while (!converged && iteration < maxIterations) {
      log.info("Iteration {" + iteration + '}');

      // point the output to a new directory per iteration
      String clustersOut = output + File.separator + "clusters-" + iteration;
      converged = runIteration(input, clustersIn, clustersOut, measureClass,
          delta, numMapTasks, numReduceTasks, iteration, m);

      // now point the input to the old output directory
      clustersIn = output + File.separator + "clusters-" + iteration;
      iteration++;
    }

    // now actually cluster the points
    log.info("Clustering ");

    runClustering(input, clustersIn, output + File.separator + "points",
        measureClass, delta, numMapTasks, m);
  }

  /**
   * Run the job using supplied arguments
   *
   * @param input the directory pathname for input points
   * @param clustersIn the directory pathname for iniput clusters
   * @param clustersOut the directory pathname for output clusters
   * @param measureClass the classname of the DistanceMeasure
   * @param convergenceDelta the convergence delta value
   * @param numMapTasks the number of map tasks
   * @param iterationNumber the iteration number that is going to run
   * @param m
   * @return true if the iteration successfully runs
   */
  private static boolean runIteration(String input, String clustersIn,
      String clustersOut, String measureClass, String convergenceDelta,
      int numMapTasks, int numReduceTasks, int iterationNumber, float m) {

    JobConf conf = new JobConf(FuzzyKMeansJob.class);
    conf.setJobName("Fuzzy K Means{" + iterationNumber + '}');

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(clustersOut);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(FuzzyKMeansMapper.class);
    conf.setCombinerClass(FuzzyKMeansCombiner.class);
    conf.setReducerClass(FuzzyKMeansReducer.class);
    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);
   
    conf.set(SoftCluster.CLUSTER_PATH_KEY, clustersIn);
    conf.set(SoftCluster.DISTANCE_MEASURE_KEY, measureClass);
    conf.set(SoftCluster.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
    conf.set(SoftCluster.M_KEY, String.valueOf(m));

    // uncomment it to run locally
    // conf.set("mapred.job.tracker", "local");

    try {
      JobClient.runJob(conf);
      FileSystem fs = FileSystem.get(conf);
      return isConverged(clustersOut, conf, fs);
    } catch (IOException e) {
      log.warn(e.toString(), e);
      return true;
    }
  }

  /**
   * Run the job using supplied arguments
   *
   * @param input the directory pathname for input points
   * @param clustersIn the directory pathname for input clusters
   * @param output the directory pathname for output points
   * @param measureClass the classname of the DistanceMeasure
   * @param convergenceDelta the convergence delta value
   * @param numMapTasks the number of map tasks
   */
  private static void runClustering(String input, String clustersIn,
      String output, String measureClass, String convergenceDelta,
      int numMapTasks, float m) {

    JobConf conf = new JobConf(FuzzyKMeansDriver.class);
    conf.setJobName("Fuzzy K Means Clustering");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(FuzzyKMeansClusterMapper.class);

    // uncomment it to run locally
    // conf.set("mapred.job.tracker", "local");
    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(0);
    conf.set(SoftCluster.CLUSTER_PATH_KEY, clustersIn);
    conf.set(SoftCluster.DISTANCE_MEASURE_KEY, measureClass);
    conf.set(SoftCluster.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
    conf.set(SoftCluster.M_KEY, String.valueOf(m));
    try {
      JobClient.runJob(conf);
    } catch (IOException e) {
      log.warn(e.toString(), e);
    }
  }

  /**
   * Return if all of the Clusters in the filePath have converged or not
   *
   * @param filePath the file path to the single file containing the clusters
   * @param conf the JobConf
   * @param fs the FileSystem
   * @return true if all Clusters are converged
   * @throws IOException if there was an IO error
   */
  private static boolean isConverged(String filePath, JobConf conf,
      FileSystem fs) throws IOException {

    Path clusterPath = new Path(filePath);
    List<Path> result = new ArrayList<Path>();

    PathFilter clusterFileFilter = new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return path.getName().startsWith("part");
      }
    };

    FileStatus[] matches = fs.listStatus(FileUtil.stat2Paths(fs.globStatus(
        clusterPath, clusterFileFilter)), clusterFileFilter);

    for (FileStatus match : matches) {
      result.add(fs.makeQualified(match.getPath()));
    }
    boolean converged = true;

    for (Path p : result) {

      KeyValueLineRecordReader reader = null;

      try {
        reader = new KeyValueLineRecordReader(conf, new FileSplit(p, 0, fs
            .getFileStatus(p).getLen(), (String[]) null));
        Text key = new Text();
        Text value = new Text();
        while (converged && reader.next(key, value)) {
          converged = value.toString().charAt(0) == 'V';
        }
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }

    return converged;
  }
}
TOP

Related Classes of org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle Inc. Contact coftware#gmail.com.