/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.zebra.mapred;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.StringTokenizer;
import java.util.TreeMap;
import junit.framework.Assert;
import junit.framework.TestCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.file.tfile.Utils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.zebra.io.BasicTable;
import org.apache.hadoop.zebra.io.TableScanner;
import org.apache.hadoop.zebra.mapred.BasicTableOutputFormat;
import org.apache.hadoop.zebra.mapred.TableInputFormat;
import org.apache.hadoop.zebra.mapred.ArticleGenerator.Summary;
import org.apache.hadoop.zebra.mapred.TestBasicTableIOFormatLocalFS.FreqWordCache.Item;
import org.apache.hadoop.zebra.types.ParseException;
import org.apache.hadoop.zebra.types.Projection;
import org.apache.hadoop.zebra.types.Schema;
import org.apache.hadoop.zebra.types.TypesUtils;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
/**
* Testing BasicTableOutputFormat and TableInputFormat using Local FS
*/
public class TestBasicTableIOFormatLocalFS extends TestCase {
static class Options {
int taskTrackers = 4;
int dataNodes = 4;
int srcFiles = 7;
int numBatches = 1;
int srcFileLen = 2 * 1024; // 2KB
int numMapper = 3;
int numReducer = 2;
int numFreqWords = 10;
long minTableSplitSize = 16 * 1024L;
boolean localFS = true;
String compression = "none";
String rootPath = "TestBasicTableIOFormat";
String srcPath = "docs";
String fwdIndexRootPath = "fwdIndex";
String invIndexTablePath = "invIndex";
String freqWordTablePath = "freqWord";
}
static Log LOG = LogFactory.getLog(TestBasicTableIOFormatLocalFS.class
.getName());
Options options;
Configuration conf;
MiniDFSCluster dfs;
FileSystem fileSys;
Path rootPath;
Path srcPath;
Path fwdIndexRootPath;
Path invIndexTablePath;
Path freqWordTablePath;
MiniMRCluster mr;
ArticleGenerator articleGen;
Map<String, Summary> summary;
@Override
protected void setUp() throws IOException {
if (System.getProperty("hadoop.log.dir") == null) {
String base = new File(".").getPath(); // getAbsolutePath();
Path path = new Path(".");
System
.setProperty("hadoop.log.dir", new Path(base).toString() + "./logs");
}
if (options == null) {
options = new Options();
}
if (conf == null) {
conf = new Configuration();
}
articleGen = new ArticleGenerator(1000, 1, 20, 100);
summary = new HashMap<String, Summary>();
if (options.localFS) {
Path localFSRootPath = new Path(System.getProperty("test.build.data",
"build/test/data/work-dir"));
fileSys = localFSRootPath.getFileSystem(conf);
rootPath = new Path(localFSRootPath, options.rootPath);
mr = new MiniMRCluster(options.taskTrackers, "file:///", 3);
} else {
dfs = new MiniDFSCluster(conf, options.dataNodes, true, null);
fileSys = dfs.getFileSystem();
rootPath = new Path(options.rootPath);
mr = new MiniMRCluster(options.taskTrackers, fileSys.getUri().toString(),
1);
}
conf = getJobConf("TestBasicTableIOFormat");
srcPath = new Path(rootPath, options.srcPath);
fwdIndexRootPath = new Path(rootPath, options.fwdIndexRootPath);
invIndexTablePath = new Path(rootPath, options.invIndexTablePath);
freqWordTablePath = new Path(rootPath, options.freqWordTablePath);
}
@Override
protected void tearDown() throws IOException {
if (mr != null) {
mr.shutdown();
}
if (dfs != null) {
dfs.shutdown();
}
}
JobConf getJobConf(String name) {
JobConf jobConf = mr.createJobConf();
jobConf.setJobName(name);
jobConf.set("table.output.tfile.compression", options.compression);
jobConf.setInt("mapred.app.freqWords.count", options.numFreqWords);
return jobConf;
}
/**
* Create a bunch of text files under a sub-directory of srcPath. The
* sub-directory is named after the batch name.
*
* @param batchName
* The batch name.
* @throws IOException
*/
void createSourceFiles(String batchName) throws IOException {
LOG.info("Creating source data folder: " + batchName);
Path batchDir = new Path(srcPath, batchName);
LOG.info("Cleaning directory: " + batchName);
fileSys.delete(batchDir, true);
LOG.info("Generating input files: " + batchName);
articleGen.batchArticalCreation(fileSys, batchDir, "doc-",
options.srcFiles, options.srcFileLen);
Summary s = articleGen.getSummary();
// dumpSummary(s);
long total = 0;
for (Long cnt : s.wordCntDist.values()) {
total += cnt;
}
Assert.assertEquals(total, s.wordCount);
summary.put(batchName, s);
articleGen.resetSummary();
}
/**
* Generate forward index. Map-only task.
*
* <pre>
* Map Input:
* K = LongWritable (byte offset)
* V = Text (text line)
* Map Output:
* K = word: BytesWritable.
* V = Tuple of {fileName:String, wordPos:Integer, lineNo:Integer }
* </pre>
*/
static class ForwardIndexGen {
static class MapClass implements
Mapper<LongWritable, Text, BytesWritable, Tuple> {
private BytesWritable outKey;
private Tuple outRow;
// indices into the output tuple for fileName, wordPos, lineNo.
private int idxFileName, idxWordPos, idxLineNo;
private String filePath;
// line number and running word position within the input split.
private int lineNo = 0;
private int wordPos = 0;
@Override
public void map(LongWritable key, Text value,
OutputCollector<BytesWritable, Tuple> output, Reporter reporter)
throws IOException {
if (filePath == null) {
FileSplit split = (FileSplit) reporter.getInputSplit();
filePath = split.getPath().toString();
}
String line = value.toString();
StringTokenizer st = new StringTokenizer(line, " ");
while (st.hasMoreElements()) {
byte[] word = st.nextToken().getBytes();
outKey.set(word, 0, word.length);
TypesUtils.resetTuple(outRow);
try {
outRow.set(idxFileName, filePath);
outRow.set(idxWordPos, wordPos);
outRow.set(idxLineNo, lineNo);
output.collect(outKey, outRow);
} catch (ExecException e) {
e.printStackTrace();
}
++wordPos;
}
++lineNo;
}
@Override
public void configure(JobConf job) {
LOG.info("ForwardIndexGen.MapClass.configure");
outKey = new BytesWritable();
try {
Schema outSchema = BasicTableOutputFormat.getSchema(job);
outRow = TypesUtils.createTuple(outSchema);
idxFileName = outSchema.getColumnIndex("fileName");
idxWordPos = outSchema.getColumnIndex("wordPos");
idxLineNo = outSchema.getColumnIndex("lineNo");
} catch (IOException e) {
throw new RuntimeException("Schema parsing failed : "
+ e.getMessage());
} catch (ParseException e) {
throw new RuntimeException("Schema parsing failed : "
+ e.getMessage());
}
}
@Override
public void close() throws IOException {
// no-op
}
}
}
/**
* Run forward index generation.
*
* @param batchName
* @throws IOException
*/
void runForwardIndexGen(String batchName) throws IOException, ParseException {
LOG.info("Run Map-only job to convert source data to forward index: "
+ batchName);
JobConf jobConf = getJobConf("fwdIndexGen-" + batchName);
// input-related settings
jobConf.setInputFormat(TextInputFormat.class);
jobConf.setMapperClass(ForwardIndexGen.MapClass.class);
FileInputFormat.setInputPaths(jobConf, new Path(srcPath, batchName));
jobConf.setNumMapTasks(options.numMapper);
// output related settings
Path outPath = new Path(fwdIndexRootPath, batchName);
fileSys.delete(outPath, true);
jobConf.setOutputFormat(BasicTableOutputFormat.class);
BasicTableOutputFormat.setOutputPath(jobConf, outPath);
BasicTableOutputFormat.setSchema(jobConf, "fileName, wordPos, lineNo");
// set map-only job.
jobConf.setNumReduceTasks(0);
JobClient.runJob(jobConf);
}
/**
* Count the number of rows in a BasicTable.
*
* @param tablePath
* The path to the BasicTable
* @return Number of rows.
* @throws IOException
*/
long countRows(Path tablePath) throws IOException, ParseException {
BasicTable.Reader reader = new BasicTable.Reader(tablePath, conf);
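// Only the row count is needed, so project as little as possible (an
// empty projection); the scanner then just walks row boundaries.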
reader.setProjection("");
long totalRows = 0;
TableScanner scanner = reader.getScanner(null, true);
for (; !scanner.atEnd(); scanner.advance()) {
++totalRows;
}
scanner.close();
return totalRows;
}
/**
* Given a batch ID, return the batch name.
*
* @param i
* batch ID
* @return Batch name.
*/
static String batchName(int i) {
return String.format("batch-%03d", i);
}
/**
* Inverted index for one word.
*/
static class InvIndex implements Writable {
int count = 0;
// a map from filePath to all positions at which the word occurs.
Map<String, ArrayList<Integer>> index;
InvIndex() {
index = new TreeMap<String, ArrayList<Integer>>();
}
InvIndex(String fileName, int pos) {
this();
add(fileName, pos);
}
void add(String fileName, int pos) {
++count;
ArrayList<Integer> list = index.get(fileName);
if (list == null) {
list = new ArrayList<Integer>(1);
index.put(fileName, list);
}
list.add(pos);
}
void add(String fileName, ArrayList<Integer> positions) {
ArrayList<Integer> list = index.get(fileName);
if (list == null) {
list = new ArrayList<Integer>();
index.put(fileName, list);
}
count += positions.size();
list.ensureCapacity(list.size() + positions.size());
list.addAll(positions);
}
void reduce(InvIndex other) {
for (Iterator<Map.Entry<String, ArrayList<Integer>>> it = other.index
.entrySet().iterator(); it.hasNext();) {
Map.Entry<String, ArrayList<Integer>> e = it.next();
add(e.getKey(), e.getValue());
}
}
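// Wire format (all integers VInt-encoded): count, then if count > 0:
// the number of files, and per file its name, the number of positions,
// and the positions themselves. readFields() mirrors write() below.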
@Override
public void readFields(DataInput in) throws IOException {
index.clear();
count = Utils.readVInt(in);
if (count > 0) {
int n = Utils.readVInt(in);
for (int i = 0; i < n; ++i) {
String fileName = Utils.readString(in);
int m = Utils.readVInt(in);
ArrayList<Integer> list = new ArrayList<Integer>(m);
index.put(fileName, list);
for (int j = 0; j < m; ++j) {
list.add(Utils.readVInt(in));
}
}
}
}
@Override
public void write(DataOutput out) throws IOException {
Utils.writeVInt(out, count);
if (count > 0) {
Utils.writeVInt(out, index.size());
for (Iterator<Map.Entry<String, ArrayList<Integer>>> it = index
.entrySet().iterator(); it.hasNext();) {
Map.Entry<String, ArrayList<Integer>> e = it.next();
Utils.writeString(out, e.getKey());
ArrayList<Integer> list = e.getValue();
Utils.writeVInt(out, list.size());
for (Iterator<Integer> it2 = list.iterator(); it2.hasNext();) {
Utils.writeVInt(out, it2.next());
}
}
}
}
}
/**
* Generate inverted Index.
*
* <pre>
* Mapper Input =
* K: BytesWritable word;
* V: Tuple { fileName:String, wordPos:Integer };
*
* Mapper Output =
* K: BytesWritable word;
* V: InvIndex;
*
* Reducer Output =
* K: BytesWritable word;
* V: Tuple {count:Integer, index: Map of {fileName:String, Bag of {wordPos:Integer}}};
* </pre>
*/
static class InvertedIndexGen {
static class MapClass implements
Mapper<BytesWritable, Tuple, BytesWritable, InvIndex> {
// indices of the fileName and wordPos fields of the input tuple
int idxFileName, idxWordPos;
@Override
public void map(BytesWritable key, Tuple value,
OutputCollector<BytesWritable, InvIndex> output, Reporter reporter)
throws IOException {
try {
String fileName = (String) value.get(idxFileName);
int wordPos = (Integer) value.get(idxWordPos);
output.collect(key, new InvIndex(fileName, wordPos));
} catch (ExecException e) {
e.printStackTrace();
}
}
@Override
public void configure(JobConf job) {
LOG.info("InvertedIndexGen.MapClass.configure");
String projection;
try {
projection = TableInputFormat.getProjection(job);
} catch (ParseException e) {
throw new RuntimeException("Schema parsing failed : "
+ e.getMessage());
} catch (IOException e) {
throw new RuntimeException("TableInputFormat.getProjection", e);
}
idxFileName = Projection.getColumnIndex(projection, "fileName");
idxWordPos = Projection.getColumnIndex(projection, "wordPos");
}
@Override
public void close() throws IOException {
// no-op
}
}
static class CombinerClass implements
Reducer<BytesWritable, InvIndex, BytesWritable, InvIndex> {
@Override
public void reduce(BytesWritable key, Iterator<InvIndex> values,
OutputCollector<BytesWritable, InvIndex> output, Reporter reporter)
throws IOException {
InvIndex sum = new InvIndex();
while (values.hasNext()) {
sum.reduce(values.next());
}
output.collect(key, sum);
}
@Override
public void configure(JobConf job) {
LOG.info("InvertedIndexGen.CombinerClass.configure");
}
@Override
public void close() throws IOException {
// no-op
}
}
static class ReduceClass implements
Reducer<BytesWritable, InvIndex, BytesWritable, Tuple> {
Tuple outRow;
int idxCount, idxIndex;
Schema wordPosSchema;
int idxWordPos;
@Override
public void configure(JobConf job) {
LOG.info("InvertedIndexGen.ReduceClass.configure");
try {
Schema outSchema = BasicTableOutputFormat.getSchema(job);
outRow = TypesUtils.createTuple(outSchema);
idxCount = outSchema.getColumnIndex("count");
idxIndex = outSchema.getColumnIndex("index");
wordPosSchema = new Schema("wordPos");
idxWordPos = wordPosSchema.getColumnIndex("wordPos");
} catch (IOException e) {
throw new RuntimeException("Schema parsing failed : " + e.getMessage());
} catch (ParseException e) {
throw new RuntimeException("Schema parsing failed : " + e.getMessage());
}
}
@Override
public void close() throws IOException {
// no-op
}
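/**
* Convert the in-memory index into its table representation: a map from
* file name to a bag of single-column {wordPos} tuples.
*/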
Map<String, DataBag> convertInvIndex(Map<String, ArrayList<Integer>> index)
throws IOException {
Map<String, DataBag> ret = new TreeMap<String, DataBag>();
for (Iterator<Map.Entry<String, ArrayList<Integer>>> it = index
.entrySet().iterator(); it.hasNext();) {
Map.Entry<String, ArrayList<Integer>> e = it.next();
DataBag bag = TypesUtils.createBag();
for (Iterator<Integer> it2 = e.getValue().iterator(); it2.hasNext();) {
Tuple tuple = TypesUtils.createTuple(wordPosSchema);
tuple.set(idxWordPos, it2.next());
bag.add(tuple);
}
ret.put(e.getKey(), bag);
}
return ret;
}
@Override
public void reduce(BytesWritable key, Iterator<InvIndex> values,
OutputCollector<BytesWritable, Tuple> output, Reporter reporter)
throws IOException {
InvIndex sum = new InvIndex();
while (values.hasNext()) {
sum.reduce(values.next());
}
try {
outRow.set(idxCount, sum.count);
outRow.set(idxIndex, convertInvIndex(sum.index));
output.collect(key, outRow);
} catch (ExecException e) {
e.printStackTrace();
}
}
}
}
void runInvertedIndexGen() throws IOException, ParseException {
LOG.info("Converting forward index to inverted index");
JobConf jobConf = getJobConf("runInvertedIndexGen");
// input-related settings
jobConf.setInputFormat(TableInputFormat.class);
jobConf.setMapperClass(InvertedIndexGen.MapClass.class);
Path[] paths = new Path[options.numBatches];
for (int i = 0; i < options.numBatches; ++i) {
paths[i] = new Path(fwdIndexRootPath, batchName(i));
}
TableInputFormat.setInputPaths(jobConf, paths);
// TableInputFormat.setProjection(jobConf, "fileName, wordPos");
TableInputFormat.setMinSplitSize(jobConf, options.minTableSplitSize);
jobConf.setNumMapTasks(options.numMapper);
jobConf.setMapOutputKeyClass(BytesWritable.class);
jobConf.setMapOutputValueClass(InvIndex.class);
// output related settings
fileSys.delete(invIndexTablePath, true);
jobConf.setOutputFormat(BasicTableOutputFormat.class);
jobConf.setReducerClass(InvertedIndexGen.ReduceClass.class);
jobConf.setCombinerClass(InvertedIndexGen.CombinerClass.class);
BasicTableOutputFormat.setOutputPath(jobConf, invIndexTablePath);
BasicTableOutputFormat.setSchema(jobConf, "count, index");
jobConf.setNumReduceTasks(options.numReducer);
JobClient.runJob(jobConf);
}
void reduce(Summary sum, Summary delta) {
sum.lineCount += delta.lineCount;
sum.wordCount += delta.wordCount;
reduce(sum.wordCntDist, delta.wordCntDist);
}
void reduce(Map<String, Long> sum, Map<String, Long> delta) {
for (Iterator<Map.Entry<String, Long>> it = delta.entrySet().iterator(); it
.hasNext();) {
Map.Entry<String, Long> e = it.next();
String key = e.getKey();
Long base = sum.get(key);
sum.put(key, (base == null) ? e.getValue() : base + e.getValue());
}
}
void dumpSummary(Summary s) {
LOG.info("Dumping Summary");
LOG.info("Word Count: " + s.wordCount);
for (Iterator<Map.Entry<String, Long>> it = s.wordCntDist.entrySet()
.iterator(); it.hasNext();) {
Map.Entry<String, Long> e = it.next();
LOG.info(e.getKey() + "->" + e.getValue());
}
}
/**
* Verify that the word counts from the invIndexTable are the same as those
* collected from the ArticleGenerator.
*
* @throws IOException
*/
void verifyWordCount() throws IOException, ParseException {
Summary expected = new Summary();
for (Iterator<Summary> it = summary.values().iterator(); it.hasNext();) {
Summary e = it.next();
// dumpSummary(e);
reduce(expected, e);
}
// LOG.info("Dumping aggregated Summary");
// dumpSummary(expected);
Summary actual = new Summary();
BasicTable.Reader reader = new BasicTable.Reader(invIndexTablePath, conf);
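// Project only the count column; the word itself is carried by the row key.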
reader.setProjection("count");
TableScanner scanner = reader.getScanner(null, true);
Tuple tuple = TypesUtils.createTuple(Projection.toSchema(scanner
.getProjection()));
BytesWritable key = new BytesWritable();
for (; !scanner.atEnd(); scanner.advance()) {
scanner.getKey(key);
scanner.getValue(tuple);
int count = 0;
try {
count = (Integer) tuple.get(0);
} catch (ExecException e) {
e.printStackTrace();
}
actual.wordCount += count;
String word = new String(key.get(), 0, key.getSize());
actual.wordCntDist.put(word, (long) count);
}
scanner.close();
// LOG.info("Dumping MR calculated Summary");
// dumpSummary(actual);
Assert.assertEquals(expected.wordCount, actual.wordCount);
Assert.assertEquals(expected.wordCntDist.size(), actual.wordCntDist.size());
for (Iterator<Map.Entry<String, Long>> it = expected.wordCntDist.entrySet()
.iterator(); it.hasNext();) {
Map.Entry<String, Long> e = it.next();
String word = e.getKey();
Long myCount = actual.wordCntDist.get(word);
Assert.assertFalse(word, myCount == null);
Assert.assertEquals(word, e.getValue(), myCount);
}
}
/**
* Caches the top K most frequent words.
*/
static class FreqWordCache {
static class Item {
BytesWritable word;
int count;
Item(BytesWritable w, int c) {
word = new BytesWritable();
word.set(w.get(), 0, w.getSize());
count = c;
}
}
int k;
PriorityQueue<Item> words;
FreqWordCache(int k) {
if (k <= 0) {
throw new IllegalArgumentException("Expecting positive int");
}
this.k = k;
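// Min-heap on count, so poll() always evicts the least frequent word;
// equal counts fall back to descending word order to keep the order total.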
words = new PriorityQueue<Item>(k, new Comparator<Item>() {
@Override
public int compare(Item o1, Item o2) {
if (o1.count != o2.count) {
return o1.count - o2.count;
}
return -o1.word.compareTo(o2.word);
}
});
}
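// Evict entries less frequent than the new word until there is room, then
// admit the word if the cache has space or it ties the current minimum
// count; ties are all kept, so the cache may briefly exceed k items.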
void add(BytesWritable word, int cnt) {
while ((words.size() >= k) && words.peek().count < cnt) {
words.poll();
}
if ((words.size() < k) || words.peek().count == cnt) {
words.add(new Item(word, cnt));
}
}
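// Bulk variant used by the combiner and reducer: every word sharing the
// same count is either admitted or dropped as a group.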
void add(Iterator<BytesWritable> itWords, int cnt) {
while ((words.size() >= k) && words.peek().count < cnt) {
words.poll();
}
if ((words.size() < k) || words.peek().count == cnt) {
while (itWords.hasNext()) {
words.add(new Item(itWords.next(), cnt));
}
}
}
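// Drain the heap in ascending order of count, then reverse in place so
// the result runs from most to least frequent.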
Item[] toArray() {
Item[] ret = new Item[words.size()];
for (int i = 0; i < ret.length; ++i) {
ret[i] = words.poll();
}
for (int i = 0; i < ret.length / 2; ++i) {
Item tmp = ret[i];
ret[i] = ret[ret.length - i - 1];
ret[ret.length - i - 1] = tmp;
}
return ret;
}
}
/**
* Get the most frequent words from the inverted index. The mapper uses a
* priority queue to keep the most frequent words in memory and outputs the
* results in close().
*
* <pre>
* Mapper Input =
* K: BytesWritable word;
* V: Tuple { count:Integer }.
*
* Mapper Output =
* K: IntWritable count;
* V: BytesWritable word;
*
* Reducer Output =
* K: BytesWritable word;
* V: Tuple { count:Integer }.
* </pre>
*/
static class FreqWords {
static int getFreqWordsCount(Configuration conf) {
return conf.getInt("mapred.app.freqWords.count", 100);
}
static class MapClass implements
Mapper<BytesWritable, Tuple, IntWritable, BytesWritable> {
int idxCount;
FreqWordCache freqWords;
IntWritable intWritable;
OutputCollector<IntWritable, BytesWritable> out;
@Override
public void map(BytesWritable key, Tuple value,
OutputCollector<IntWritable, BytesWritable> output, Reporter reporter)
throws IOException {
if (out == null) {
out = output;
}
try {
int count = (Integer) value.get(idxCount);
freqWords.add(key, count);
reporter.progress();
} catch (ExecException e) {
e.printStackTrace();
}
}
@Override
public void configure(JobConf job) {
LOG.info("FreqWords.MapClass.configure");
String inSchema;
try {
inSchema = TableInputFormat.getProjection(job);
} catch (ParseException e) {
throw new RuntimeException("Projection parsing failed : "
+ e.getMessage());
} catch (IOException e) {
throw new RuntimeException("TableInputFormat.getprojection", e);
}
idxCount = Projection.getColumnIndex(inSchema, "count");
intWritable = new IntWritable();
freqWords = new FreqWordCache(getFreqWordsCount(job));
}
@Override
public void close() throws IOException {
Item[] items = freqWords.toArray();
for (Item i : items) {
intWritable.set(i.count);
out.collect(intWritable, i.word);
}
}
}
static class CombinerClass implements
Reducer<IntWritable, BytesWritable, IntWritable, BytesWritable> {
FreqWordCache freqWords;
OutputCollector<IntWritable, BytesWritable> out;
IntWritable intWritable;
@Override
public void configure(JobConf job) {
LOG.info("FreqWords.CombinerClass.configure");
freqWords = new FreqWordCache(getFreqWordsCount(job));
intWritable = new IntWritable();
}
@Override
public void close() throws IOException {
Item[] items = freqWords.toArray();
for (Item i : items) {
intWritable.set(i.count);
out.collect(intWritable, i.word);
}
}
@Override
public void reduce(IntWritable key, Iterator<BytesWritable> values,
OutputCollector<IntWritable, BytesWritable> output, Reporter reporter)
throws IOException {
if (out == null) {
out = output;
}
freqWords.add(values, key.get());
reporter.progress();
}
}
static class ReduceClass implements
Reducer<IntWritable, BytesWritable, BytesWritable, Tuple> {
FreqWordCache freqWords;
OutputCollector<BytesWritable, Tuple> out;
Tuple outRow;
int idxCount;
@Override
public void configure(JobConf job) {
LOG.info("FreqWords.ReduceClass.configure");
freqWords = new FreqWordCache(getFreqWordsCount(job));
try {
Schema outSchema = BasicTableOutputFormat.getSchema(job);
outRow = TypesUtils.createTuple(outSchema);
idxCount = outSchema.getColumnIndex("count");
} catch (IOException e) {
throw new RuntimeException("Schema parsing failed : "
+ e.getMessage());
} catch (ParseException e) {
throw new RuntimeException("Schema parsing failed : "
+ e.getMessage());
}
}
@Override
public void close() throws IOException {
Item[] items = freqWords.toArray();
for (Item i : items) {
try {
outRow.set(idxCount, i.count);
out.collect(i.word, outRow);
} catch (ExecException e) {
e.printStackTrace();
}
}
}
@Override
public void reduce(IntWritable key, Iterator<BytesWritable> values,
OutputCollector<BytesWritable, Tuple> output, Reporter reporter)
throws IOException {
if (out == null) {
out = output;
}
freqWords.add(values, key.get());
reporter.progress();
}
}
}
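/**
* Inverts the natural IntWritable order so that map output keys (word
* counts) sort in descending order and the most frequent words arrive at
* the reducer first.
*/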
static class InverseIntRawComparator implements RawComparator<IntWritable> {
IntWritable.Comparator comparator;
InverseIntRawComparator() {
comparator = new IntWritable.Comparator();
}
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
return -comparator.compare(b1, s1, l1, b2, s2, l2);
}
@Override
public int compare(IntWritable o1, IntWritable o2) {
return -comparator.compare(o1, o2);
}
}
void runFreqWords() throws IOException, ParseException {
LOG.info("Find the most frequent words");
JobConf jobConf = getJobConf("runFreqWords");
// input-related settings
jobConf.setInputFormat(TableInputFormat.class);
jobConf.setMapperClass(FreqWords.MapClass.class);
TableInputFormat.setInputPaths(jobConf, invIndexTablePath);
TableInputFormat.setProjection(jobConf, "count");
TableInputFormat.setMinSplitSize(jobConf, options.minTableSplitSize);
// jobConf.setNumMapTasks(options.numMapper);
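// Pass -1 so the number of maps is presumably left to the input format's
// split calculation, driven by the min split size set above.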
jobConf.setNumMapTasks(-1);
jobConf.setMapOutputKeyClass(IntWritable.class);
jobConf.setMapOutputValueClass(BytesWritable.class);
// Set customized output comparator.
jobConf.setOutputKeyComparatorClass(InverseIntRawComparator.class);
// output related settings
fileSys.delete(freqWordTablePath, true);
jobConf.setOutputFormat(BasicTableOutputFormat.class);
jobConf.setReducerClass(FreqWords.ReduceClass.class);
jobConf.setCombinerClass(FreqWords.CombinerClass.class);
BasicTableOutputFormat.setOutputPath(jobConf, freqWordTablePath);
BasicTableOutputFormat.setSchema(jobConf, "count");
jobConf.setNumReduceTasks(1);
JobClient.runJob(jobConf);
}
void printFreqWords() throws IOException, ParseException {
LOG.info("Printing the most frequent words");
BasicTable.Reader reader = new BasicTable.Reader(freqWordTablePath, conf);
TableScanner scanner = reader.getScanner(null, true);
BytesWritable key = new BytesWritable();
Schema schema = Projection.toSchema(scanner.getProjection());
int idxCount = schema.getColumnIndex("count");
Tuple value = TypesUtils.createTuple(schema);
for (; !scanner.atEnd(); scanner.advance()) {
scanner.getKey(key);
scanner.getValue(value);
try {
String word = new String(key.get(), 0, key.getSize());
int count = (Integer) value.get(idxCount);
LOG.info(String.format("%s\t%d", word, count));
} catch (ExecException e) {
e.printStackTrace();
}
}
scanner.close();
}
/**
* Testing BasicTableOutputFormat and TableInputFormat by running a sequence
* of MapReduce jobs.
*
* @throws IOException
*/
public void testBasicTable() throws IOException, ParseException {
LOG.info("testBasicTable");
LOG.info("testing BasicTableOutputFormat in Map-only job");
for (int i = 0; i < options.numBatches; ++i) {
String batchName = batchName(i);
createSourceFiles(batchName);
runForwardIndexGen(batchName);
LOG.info("Forward index conversion complete: " + batchName);
Assert.assertEquals(summary.get(batchName).wordCount, countRows(new Path(
fwdIndexRootPath, batchName)));
}
runInvertedIndexGen();
verifyWordCount();
runFreqWords();
printFreqWords();
}
}