Package ivory.lsh.pwsim.cl

Source code of ivory.lsh.pwsim.cl.CLSlidingWindowPwsim (the listing below contains the full outer class, including the inner classes MyMapper, MyReducer, and MyReducerTopN)

package ivory.lsh.pwsim.cl;

import ivory.lsh.data.BitsSignatureTable;
import ivory.lsh.data.Signature;
import ivory.lsh.driver.PwsimEnvironment;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeSet;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import edu.umd.cloud9.io.map.HMapIIW;
import edu.umd.cloud9.io.pair.PairOfInts;

/**
 * Implementation of the sliding window algorithm for cross-lingual pairwise similarity (see
 * Ture et al., SIGIR'11, for details and pseudocode).
 *
 * @author ferhanture
 */
public class CLSlidingWindowPwsim extends Configured implements Tool {

  private static final Logger sLogger = Logger.getLogger(CLSlidingWindowPwsim.class);

  // Job counters: Processed counts candidate pairs compared, Emitted counts pairs within the
  // distance threshold, and PrefixSum accumulates longest-common-prefix lengths (used to report
  // the average prefix length at the end of run()).
  static enum Pairs {
    Processed, Emitted, PrefixSum
  }

  public static class MyMapper extends MapReduceBase implements
  Mapper<IntWritable, BitsSignatureTable, PairOfInts, IntWritable> {
    static int slidingWindowSize, maxDist;

    private Signature[] signatures = null;
    private int[] docNos = null;
    private HMapIIW samplesMap = null;
    private int hammingDistance;
    private PairOfInts outKey = new PairOfInts();
    private IntWritable outValue = new IntWritable();
    private int nSignatures = -1;

    private String getFilename(String s) {
      return s.substring(s.lastIndexOf("/") + 1);
    }

    public void configure(JobConf conf) {
      sLogger.setLevel(Level.INFO);

      slidingWindowSize = conf.getInt("Ivory.SlidingWindowSize", -1);
      maxDist = conf.getInt("Ivory.MaxHammingDistance", -1);

      // read doc ids of sample into vectors
      String samplesFile = conf.get("Ivory.SampleFile");
      if (samplesFile != null) {
        try {
          samplesMap = readSamplesFromCache(getFilename(samplesFile), conf);
        } catch (NumberFormatException e) {
          e.printStackTrace();
          throw new RuntimeException("Incorrect format in " + samplesFile);
        } catch (IOException e) {
          e.printStackTrace();
          throw new RuntimeException("I/O error in " + samplesFile);
        } catch (Exception e) {
          e.printStackTrace();
          throw new RuntimeException("Error reading sample file: " + samplesFile);
        }
      }

    }

    private HMapIIW readSamplesFromCache(String samplesFile, JobConf conf) throws IOException {
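      // The sample file is served out of the distributed cache; it is expected to contain one
      // integer docno per line.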
      Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
      HMapIIW samplesMap = null;
      for (Path localFile : localFiles) {
        if (localFile.toString().contains(samplesFile)) {
          samplesMap = new HMapIIW();
          LineReader reader = new LineReader(FileSystem.getLocal(conf).open(localFile));
          Text t = new Text();
          while (reader.readLine(t) != 0) {
            int docno = Integer.parseInt(t.toString());
            sLogger.info(docno + " --> sample");
            samplesMap.put(docno, 1);
          }
          reader.close();
          sLogger.info(samplesMap.size() + " sampled");
        }
      }
      if (samplesMap == null) throw new RuntimeException("Not found in local cache: " + samplesFile);
      return samplesMap;
    }

    public void map(IntWritable permNo, BitsSignatureTable signatureTable,
        OutputCollector<PairOfInts, IntWritable> output, Reporter reporter) throws IOException {
      signatures = signatureTable.getSignatures();
      docNos = signatureTable.getDocNos();
      nSignatures = signatureTable.getNumOfSignatures();
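      // Slide a window over the signature table: each pivot signature i is compared against up
      // to (slidingWindowSize - 1) neighbors on either side. Pivots are restricted to docnos
      // above 1000000000 (which, per the comment on the emitted pair below, appear to mark the
      // German-side collection) or to docnos in the provided sample; neighbors with docnos above
      // 1000000000 are skipped, so only cross-lingual pairs are compared.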
      for (int i = 0; i < nSignatures; i++) {
        if ((docNos[i] > 1000000000 && samplesMap == null)
            || (samplesMap != null && samplesMap.containsKey(docNos[i]))) {
          for (int j = i - 1; j > i - slidingWindowSize && j >= 0; j--) {
            if (docNos[j] > 1000000000) {
              continue;
            }
            int prefix = signatures[i].getLongestPrefix(signatures[j]);
            reporter.incrCounter(Pairs.PrefixSum, prefix);
            reporter.incrCounter(Pairs.Processed, 1);
            hammingDistance = signatures[i].hammingDistance(signatures[j], maxDist);
            sLogger.debug(hammingDistance);
            if (hammingDistance <= maxDist) {
              reporter.incrCounter(Pairs.Emitted, 1);

              outValue.set(hammingDistance);
              outKey.set(docNos[j], docNos[i]); // pair format: English docno first, then German docno
              output.collect(outKey, outValue);
            }
          }
          for (int j = i + 1; j < i + slidingWindowSize && j < nSignatures; j++) {
            if (docNos[j] > 1000000000) {
              continue;
            }
            int prefix = signatures[i].getLongestPrefix(signatures[j]);
            reporter.incrCounter(Pairs.PrefixSum, prefix);
            reporter.incrCounter(Pairs.Processed, 1);
            hammingDistance = signatures[i].hammingDistance(signatures[j], maxDist);
            sLogger.debug(hammingDistance);
            if (hammingDistance <= maxDist) {
              reporter.incrCounter(Pairs.Emitted, 1);

              outValue.set(hammingDistance);
              outKey.set(docNos[j], docNos[i]);
              output.collect(outKey, outValue);
            }
          }
        }
      }
    }
  }

  // Use this Reducer when the mapper output is (PairOfInts, IntWritable). It behaves like an
  // IdentityReducer, except that duplicate key-value pairs are collapsed: only the first value
  // for each pair is emitted.
  public static class MyReducer extends MapReduceBase implements
  Reducer<PairOfInts, IntWritable, PairOfInts, IntWritable> {

    public void reduce(PairOfInts key, Iterator<IntWritable> val,
        OutputCollector<PairOfInts, IntWritable> output, Reporter reporter) throws IOException {
      output.collect(key, val.next());
      reporter.incrCounter(Pairs.Emitted, 1);
    }
  }

  // Use this Reducer when the mapper output is (IntWritable, PairOfInts); it emits the top N
  // most similar pairs for each key document.
  public static class MyReducerTopN extends MapReduceBase implements
  Reducer<IntWritable, PairOfInts, IntWritable, PairOfInts> {
    int numResults;
    TreeSet<PairOfInts> list = new TreeSet<PairOfInts>();

    public void configure(JobConf conf) {
      numResults = conf.getInt("Ivory.NumResults", -1);
      sLogger.info("numResults = " + numResults);
    }

    public void reduce(IntWritable key, Iterator<PairOfInts> values,
        OutputCollector<IntWritable, PairOfInts> output, Reporter reporter) throws IOException {
      list.clear();
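      // The TreeSet keeps candidates in PairOfInts' natural order (left element first, then
      // right), so pollFirst() below returns the smallest pairs and duplicates are collapsed.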
      while (values.hasNext()) {
        PairOfInts p = values.next();
        list.add(new PairOfInts(p.getLeftElement(), p.getRightElement()));
      }
      int cntr = 0;
      while (!list.isEmpty() && cntr < numResults) {
        output.collect(key, list.pollFirst());
        cntr++;
      }
    }
  }

  public int run(String[] args) throws Exception {
    if ( parseArgs(args) < 0 ) {
      printUsage();
      return -1;
    }

    JobConf job = new JobConf(getConf(), CLSlidingWindowPwsim.class);
    FileSystem fs = FileSystem.get(job);
    inputPath = inputPath == null ? PwsimEnvironment.getTablesDir(workDir, fs, signatureType, numOfBits, chunkOverlapSize, numOfPermutations) : inputPath;
    outputPath = outputPath == null ? PwsimEnvironment.getPwsimDir(workDir, signatureType, maxDist, numOfBits, numOfPermutations, windowSize) : outputPath;

    if (fs.exists(new Path(outputPath))) {
      sLogger.info("SlidingWindowPwsim output already exists! Quitting...\nPath: " + outputPath);
      return 0;
    }

    if (sampleDocnosFile != null) {
      DistributedCache.addCacheFile(new URI(sampleDocnosFile), job);
      job.set("Ivory.SampleFile", sampleDocnosFile);
    }

    String collectionName = job.get("Ivory.CollectionName");
    job.setJobName("SlidingWindowPwsim:" + collectionName
        + (workDir != null ? workDir.replaceFirst("tables", "") : "")
        + "_B=" + windowSize + "_" + numResults);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    FileOutputFormat.setCompressOutput(job, false);

    job.setJarByClass(CLSlidingWindowPwsim.class);

    job.set("mapreduce.map.java.opts", "-Xmx2000m");
    job.setInt("mapred.task.timeout", 60000000);
    job.setInt("Ivory.SlidingWindowSize", windowSize);
    job.setInt("Ivory.MaxHammingDistance", maxDist);
    job.setNumMapTasks(100);
    job.setNumReduceTasks(numReducers);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(PairOfInts.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(MyMapper.class);
    if (numResults == -1) {
      job.setReducerClass(MyReducer.class);
    } else {
      job.setReducerClass(MyReducerTopN.class);
    }
    if (sampleDocnosFile == null) {
      job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
      // if a sample file is provided, output as text
      job.setOutputFormat(TextOutputFormat.class);
    }

    sLogger.info("Running job " + job.getJobName() + "...");
    sLogger.info("Input path: " + inputPath);
    sLogger.info("Output path: " + outputPath);
    sLogger.info("Window size: " + windowSize);
    sLogger.info("Threshold: " + maxDist);
    sLogger.info("Sample file?: " + ((sampleDocnosFile != null) ? sampleDocnosFile : "none"));
    sLogger.info("Number of results: " + (numResults == -1 ? "all" : numResults));

    long startTime = System.currentTimeMillis();
    RunningJob j = JobClient.runJob(job);
    System.out.println("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
   
    Counters counters = j.getCounters();
    long processed = counters.findCounter(Pairs.Processed).getCounter();
    long prefixsum = counters.findCounter(Pairs.PrefixSum).getCounter();
    System.out.println("Avg prefix length = " + (prefixsum / (float) processed));

    return 0;
  }

  private static final String WORKDIR_PATH_OPTION = "index";
  private static final String INPUT_PATH_OPTION = "input";
  private static final String OUTPUT_PATH_OPTION = "output";
  private static final String THRESHOLD_OPTION = "T";
  private static final String WINDOWSIZE_OPTION = "B";
  private static final String SIGNLENG_OPTION = "num_bits";
  private static final String NUMPERMS_OPTION = "Q";
  private static final String OVERLAPSIZE_OPTION = "overlap";
  private static final String SIGNTYPE_OPTION = "type";
  private static final String SAMPLEDOCNOS_OPTION = "docnos";
  private static final String NUMREDUCERS_OPTION = "reduce";
  private static final String TOPN_OPTION = "topN";
  private static final String LIBJARS_OPTION = "libjars";
  private Options options;
  private int numOfPermutations, chunkOverlapSize, numReducers, windowSize, maxDist, numResults, numOfBits;
  private String signatureType, sampleDocnosFile, workDir, inputPath, outputPath;

  private void printUsage() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getCanonicalName(), options);
  }

  @SuppressWarnings("static-access")
  private int parseArgs(String[] args) {
    options = new Options();
    options.addOption(OptionBuilder.withDescription("path to index directory").withArgName("path").hasArg().create(WORKDIR_PATH_OPTION));
    options.addOption(OptionBuilder.withDescription("path to input (permuted tables)").withArgName("path").hasArg().create(INPUT_PATH_OPTION));
    options.addOption(OptionBuilder.withDescription("path to output").withArgName("path").hasArg().create(OUTPUT_PATH_OPTION));
    options.addOption(OptionBuilder.withDescription("hamming distance threshold for similar pairs").withArgName("threshold").hasArg().isRequired().create(THRESHOLD_OPTION));
    options.addOption(OptionBuilder.withDescription("only keep pairs that match these docnos").withArgName("path to sample docnos file").hasArg().create(SAMPLEDOCNOS_OPTION));
    options.addOption(OptionBuilder.withDescription("number of reducers").withArgName("number").hasArg().create(NUMREDUCERS_OPTION));
    options.addOption(OptionBuilder.withDescription("length of signature").withArgName("number of bits").hasArg().create(SIGNLENG_OPTION));
    options.addOption(OptionBuilder.withDescription("sliding window size").withArgName("window").hasArg().isRequired().create(WINDOWSIZE_OPTION));
    options.addOption(OptionBuilder.withDescription("type of signature").withArgName("random|minhash|simhash").hasArg().create(SIGNTYPE_OPTION));
    options.addOption(OptionBuilder.withDescription("number of permutations (tables)").withArgName("permutations").hasArg().create(NUMPERMS_OPTION));   
    options.addOption(OptionBuilder.withDescription("size of overlap between chunks (default: window size)").withArgName("overlap size").hasArg().create(OVERLAPSIZE_OPTION));
    options.addOption(OptionBuilder.withDescription("keep only N results for each source document").withArgName("N").hasArg().create(TOPN_OPTION));
    options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars").withArgName("jar packages").hasArg().create(LIBJARS_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    workDir = cmdline.hasOption(WORKDIR_PATH_OPTION) ? cmdline.getOptionValue(WORKDIR_PATH_OPTION) : null;
    inputPath = cmdline.hasOption(INPUT_PATH_OPTION) ? cmdline.getOptionValue(INPUT_PATH_OPTION) : null;
    outputPath = cmdline.hasOption(OUTPUT_PATH_OPTION) ? cmdline.getOptionValue(OUTPUT_PATH_OPTION) : null;
    numOfBits = cmdline.hasOption(SIGNLENG_OPTION) ? Integer.parseInt(cmdline.getOptionValue(SIGNLENG_OPTION)) : -1;
    signatureType = cmdline.hasOption(SIGNTYPE_OPTION) ? cmdline.getOptionValue(SIGNTYPE_OPTION) : null;
    numOfPermutations = cmdline.hasOption(NUMPERMS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(NUMPERMS_OPTION)) : -1;
    chunkOverlapSize = cmdline.hasOption(OVERLAPSIZE_OPTION) ? Integer.parseInt(cmdline.getOptionValue(OVERLAPSIZE_OPTION)) : -1;     

    // either work dir or input+output should be specified
    if (!((workDir != null && numOfBits > 0 && numOfPermutations > 0 && chunkOverlapSize > 0 && signatureType != null) || (inputPath != null && outputPath != null))) {
      System.err.println("Either options -" + WORKDIR_PATH_OPTION + ", -" + SIGNLENG_OPTION + ", -" + SIGNTYPE_OPTION + ", -"
          + NUMPERMS_OPTION + " and -" + OVERLAPSIZE_OPTION + ", or options -" + INPUT_PATH_OPTION + " and -" + OUTPUT_PATH_OPTION + " should be specified!");
      return -1;
    }
    numReducers = cmdline.hasOption(NUMREDUCERS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(NUMREDUCERS_OPTION)) : 100;
    windowSize = Integer.parseInt(cmdline.getOptionValue(WINDOWSIZE_OPTION));
    maxDist = Integer.parseInt(cmdline.getOptionValue(THRESHOLD_OPTION));
    numResults = cmdline.hasOption(TOPN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(TOPN_OPTION)) : -1;
    sampleDocnosFile = cmdline.hasOption(SAMPLEDOCNOS_OPTION) ? cmdline.getOptionValue(SAMPLEDOCNOS_OPTION) : null;

    return 0;
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new CLSlidingWindowPwsim(), args);
  }
}
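
A minimal invocation sketch (the class name and option letters come from the listing above; the paths and numeric values are illustrative assumptions, not taken from the source):

import ivory.lsh.pwsim.cl.CLSlidingWindowPwsim;
import org.apache.hadoop.util.ToolRunner;

public class RunCLSlidingWindowPwsim {
  public static void main(String[] args) throws Exception {
    // -T (Hamming distance threshold) and -B (window size) are the required options; here the
    // -input/-output pair is used instead of -index plus the signature parameters.
    String[] jobArgs = {
        "-input", "pwsim/tables",   // permuted signature tables (hypothetical path)
        "-output", "pwsim/output",  // similar-pairs output (hypothetical path)
        "-T", "400",                // illustrative distance threshold
        "-B", "2000"                // illustrative window size
    };
    ToolRunner.run(new CLSlidingWindowPwsim(), jobArgs);
  }
}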