package ivory.lsh.eval;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.WeightedIntDocVector;
import ivory.lsh.driver.PwsimEnvironment;
import ivory.lsh.eval.SampleSignatures.mapoutput;
import java.io.IOException;
import java.net.URI;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.Map.Entry;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import edu.umd.cloud9.io.SequenceFileUtils;
import edu.umd.cloud9.io.map.HMapIIW;
import edu.umd.cloud9.util.map.HMapII;
/**
* <p>
* A program that samples from a collection of key-value pairs, either at a fixed frequency or
* according to a provided list of docnos.
* </p>
*
* <ul>
* <li>[index] path to the working directory; used to derive default input/output paths and to
* read the collection size
* <li>[input] path to the collection of doc vectors (optional; derived from the index path if omitted)
* <li>[output] path of the output file containing the sample (optional; derived from the index path if omitted)
* <li>[size] target sample size N; each pair is kept with probability N/collection-size
* <li>[docnos] path to a text file listing the docnos to sample, one per line
* </ul>
*
* <p>
* To sample a different pair type, modify the source: change the input and output class types of
* the mapper, and update the three static fields (keyClass, valueClass, inputFormat) accordingly.
* </p>
* <p>
* Here's a sample invocation:
* </p>
*
* <blockquote>
*
* <pre>
* hadoop jar ivory.jar ivory.lsh.eval.SampleIntDocVectors
*   -index /umd-lin/fture/pwsim/medline
*   -input /umd-lin/fture/pwsim/medline/wt-int-doc-vectors
*   -output /umd-lin/fture/pwsim/medline/wt-int-doc-vectors-sample
*   -size 100
* </pre>
*
* <p>
* If a text file containing the docnos to sample (one docno per line) already exists, specify it
* with the -docnos option; the -size option is then ignored. If the -docnos option names a file
* that does not yet exist, the job samples by size and afterwards writes the docnos of the
* sampled vectors to that file.
* </p>
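*
* <p>
* For instance, a hypothetical invocation using a pre-existing docnos file (the sample.docnos
* path below is illustrative):
* </p>
*
* <pre>
* hadoop jar ivory.jar ivory.lsh.eval.SampleIntDocVectors
*   -index /umd-lin/fture/pwsim/medline
*   -docnos /umd-lin/fture/pwsim/medline/sample.docnos
* </pre>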
*
* </blockquote>
*
*
* usage: -index path [-input path] [-output path] [-size N | -docnos path] [-libjars jars]
*
* @author ferhanture
*
*/
@SuppressWarnings("deprecation")
public class SampleIntDocVectors extends Configured implements Tool {
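// Key, value, and input-format classes of the collection being sampled; update these three
// fields (together with the mapper's type parameters) to sample a different pair type.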
@SuppressWarnings("unchecked")
static Class keyClass = IntWritable.class, valueClass = WeightedIntDocVector.class,
inputFormat = SequenceFileInputFormat.class;
private static final Logger sLogger = Logger.getLogger(SampleIntDocVectors.class);
private void printUsage() {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getCanonicalName(), options);
}
private static class MyMapper extends MapReduceBase implements
Mapper<IntWritable, WeightedIntDocVector, IntWritable, WeightedIntDocVector> {
private int sampleFreq;
private HMapII samplesMap = null;
private String getFilename(String s) {
return s.substring(s.lastIndexOf("/") + 1);
}
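// The docnos file in the distributed cache is plain text with one integer docno per line,
// e.g. (illustrative values):
//   42
//   1337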
private HMapIIW readSamplesFromCache(String samplesFile, JobConf conf) throws IOException {
Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
HMapIIW samplesMap = null;
for (Path localFile : localFiles) {
if (localFile.toString().contains(samplesFile)) {
samplesMap = new HMapIIW();
LineReader reader = new LineReader(FileSystem.getLocal(conf).open(localFile));
Text t = new Text();
while (reader.readLine(t) != 0) {
int docno = Integer.parseInt(t.toString());
sLogger.info(docno + " --> sample");
samplesMap.put(docno, 1);
}
reader.close();
sLogger.info(samplesMap.size() + " sampled");
}
}
if (samplesMap == null) throw new RuntimeException("Not found in local cache: " + samplesFile);
return samplesMap;
}
public void configure(JobConf conf) {
sLogger.setLevel(Level.INFO);
sampleFreq = conf.getInt("SampleFrequency", -1);
// read the docnos to be sampled from the distributed cache
String samplesFile = conf.get("Ivory.SampleFile");
if (samplesFile != null) {
try {
samplesMap = readSamplesFromCache(getFilename(samplesFile), conf);
} catch (NumberFormatException e) {
e.printStackTrace();
throw new RuntimeException("Incorrect format in " + samplesFile);
} catch (IOException e) {
e.printStackTrace();
throw new RuntimeException("I/O error in " + samplesFile);
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error reading sample file!");
}
}
}
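// If a docnos list was provided, emit only the pairs whose docno appears in it; otherwise keep
// each pair independently with probability 1/sampleFreq.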
public void map(IntWritable key, WeightedIntDocVector val,
OutputCollector<IntWritable, WeightedIntDocVector> output, Reporter reporter)
throws IOException {
if (samplesMap != null) {
if (samplesMap.containsKey(key.get())) {
reporter.incrCounter(mapoutput.count, 1);
output.collect(key, val);
}
} else {
int randInt = (int) (Math.random() * sampleFreq); // uniform integer in [0, sampleFreq)
if (randInt == 0) {
output.collect(key, val);
}
}
}
}
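// Docnos are unique keys, so each key carries exactly one doc vector; the reducer simply passes
// it through, funneling the sample into a single output file.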
public static class MyReducer extends MapReduceBase implements
Reducer<IntWritable, WeightedIntDocVector, IntWritable, WeightedIntDocVector> {
@Override
public void reduce(IntWritable key, Iterator<WeightedIntDocVector> values,
OutputCollector<IntWritable, WeightedIntDocVector> output, Reporter reporter)
throws IOException {
output.collect(key, values.next());
}
}
@SuppressWarnings("unchecked")
public int run(String[] args) throws Exception {
sLogger.setLevel(Level.INFO);
if ( parseArgs(args) < 0 ) {
printUsage();
System.exit(-1);
}
JobConf job = new JobConf(getConf(), SampleIntDocVectors.class);
FileSystem fs = FileSystem.get(job);
inputPath = (inputPath == null) ? PwsimEnvironment.getIntDocvectorsFile(workDir, fs) : inputPath;
outputPath = (outputPath == null) ? PwsimEnvironment.getIntDocvectorsFile(workDir, fs, sampleSize) : outputPath;
if (!fs.exists(new Path(inputPath))) {
throw new RuntimeException("Error, input path does not exist: " + inputPath);
}
job.setJobName(getClass().getName());
// if a sample docnos file is provided and already exists, sample by docno rather than by size
if (sampleDocnosFile != null && fs.exists(new Path(sampleDocnosFile))) {
job.set("Ivory.SampleFile", sampleDocnosFile);
DistributedCache.addCacheFile(new URI(sampleDocnosFile), job);
} else if (sampleSize != -1) {
RetrievalEnvironment env = new RetrievalEnvironment(workDir, fs);
int collectionSize = env.readCollectionDocumentCount();
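// Keep each document with probability sampleSize/collectionSize, i.e., one in every
// (collectionSize/sampleSize) documents on average. For example (illustrative numbers), a
// collection of 1,000,000 documents with -size 100 gives sampleFreq = 10000 and an expected
// sample of about 100 vectors.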
sampleFreq = collectionSize / (float) sampleSize;
job.setInt("SampleFrequency", (int) sampleFreq);
} else {
throw new RuntimeException("Either provide sample frequency with " +
"option -" + SAMPLESIZE_OPTION+ " or existing sample docnos with option -" + SAMPLEDOCNOS_OPTION);
}
int numMappers = 100;
int numReducers = 1;
fs.delete(new Path(outputPath), true);
FileInputFormat.setInputPaths(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
FileOutputFormat.setCompressOutput(job, false);
job.setJarByClass(SampleIntDocVectors.class);
job.set("mapred.child.java.opts", "-Xmx2048m");
job.setInt("mapred.map.max.attempts", 100);
job.setInt("mapred.reduce.max.attempts", 100);
job.setInt("mapred.task.timeout", 600000000);
sLogger.info("Running job " + job.getJobName());
sLogger.info("Input directory: " + inputPath);
sLogger.info("Output directory: " + outputPath);
sLogger.info("Sample frequency: " + sampleFreq);
sLogger.info("Sample docnos: " + job.get("Ivory.SampleFile"));
job.setNumMapTasks(numMappers);
job.setNumReduceTasks(numReducers);
job.setInputFormat(inputFormat);
job.setMapOutputKeyClass(keyClass);
job.setMapOutputValueClass(valueClass);
job.setOutputKeyClass(keyClass);
job.setOutputValueClass(valueClass);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setOutputFormat(SequenceFileOutputFormat.class);
JobClient.runJob(job);
// If a docnos file was specified but does not yet exist, extract the docnos of the sampled
// vectors and write them to that file (one docno per line). With a single reducer, all sampled
// vectors end up in part-00000.
if (sampleDocnosFile != null && !fs.exists(new Path(sampleDocnosFile))) {
sLogger.info("Extracting sample docnos from sampled vectors...");
SortedMap<WritableComparable, Writable> docno2DocVectors;
try{
docno2DocVectors = SequenceFileUtils.readFileIntoMap(new Path(outputPath+"/part-00000"));
FSDataOutputStream out = fs.create(new Path(sampleDocnosFile));
for(Entry<WritableComparable, Writable> entry : docno2DocVectors.entrySet()){
int docno = ((IntWritable) entry.getKey()).get();
out.writeBytes(docno+"\n");
}
out.close();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
return 0;
}
private Options options;
private String sampleDocnosFile, inputPath, outputPath, workDir;
private int sampleSize;
private float sampleFreq;
private static final String WORKDIR_PATH_OPTION = "index";
private static final String INPUT_PATH_OPTION = "input";
private static final String OUTPUT_PATH_OPTION = "output";
private static final String SAMPLEDOCNOS_OPTION = "docnos";
private static final String SAMPLESIZE_OPTION = "size";
private static final String LIBJARS_OPTION = "libjars";
@SuppressWarnings("static-access")
private int parseArgs(String[] args) {
options = new Options();
options.addOption(OptionBuilder.withDescription("path to directory with weighted integer doc vectors").withArgName("path").hasArg().isRequired().create(WORKDIR_PATH_OPTION));
options.addOption(OptionBuilder.withDescription("path to weighted integer doc vectors").withArgName("path").hasArg().create(INPUT_PATH_OPTION));
options.addOption(OptionBuilder.withDescription("path to sampled weighted integer doc vectors").withArgName("path").hasArg().create(OUTPUT_PATH_OPTION));
options.addOption(OptionBuilder.withDescription("only keep pairs that match these docnos").withArgName("path to sample docnos file").hasArg().create(SAMPLEDOCNOS_OPTION));
options.addOption(OptionBuilder.withDescription("sample a document with probability = number-of-docs/N").withArgName("N").hasArg().create(SAMPLESIZE_OPTION));
options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars").withArgName("jar packages").hasArg().create(LIBJARS_OPTION));
CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}
workDir = cmdline.getOptionValue(WORKDIR_PATH_OPTION);
inputPath = cmdline.hasOption(INPUT_PATH_OPTION) ? cmdline.getOptionValue(INPUT_PATH_OPTION) : null;
outputPath = cmdline.hasOption(OUTPUT_PATH_OPTION) ? cmdline.getOptionValue(OUTPUT_PATH_OPTION) : null;
sampleSize = cmdline.hasOption(SAMPLESIZE_OPTION) ? Integer.parseInt(cmdline.getOptionValue(SAMPLESIZE_OPTION)) : -1;
sampleDocnosFile = cmdline.hasOption(SAMPLEDOCNOS_OPTION) ? cmdline.getOptionValue(SAMPLEDOCNOS_OPTION) : null;
return 0;
}
public static void main(String[] args) throws Exception {
ToolRunner.run(new SampleIntDocVectors(), args);
}
}