package com.manning.hip.ch12.crunch;
import com.cloudera.crunch.*;
import com.cloudera.crunch.type.PTypeFamily;
import com.manning.hip.common.ApacheCommonLogReader;
import com.manning.hip.common.CommonLogEntry;
import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import javax.annotation.Nullable;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Crunch helper functions used by the chapter 12 examples: value
 * de-duplication, inverted-index construction, and Apache Common Log
 * Format parsing.
 */
public class CrunchUtils {

  /**
   * De-duplicates the values of each key and collapses them into a single
   * comma-separated string.
   *
   * @param collect table whose per-key values should be reduced to their
   *     distinct set
   * @param <K> key type of the table
   * @return a table with one value per key: the distinct input values joined
   *     with commas (set iteration order, and hence join order, is
   *     unspecified)
   */
  public static <K> PTable<K, String> uniqueValues(PTable<K, String> collect) {
    return collect.groupByKey().combineValues(new CombineFn<K, String>() {
      @Override
      public void process(Pair<K, Iterable<String>> input,
                          Emitter<Pair<K, String>> emitter) {
        // HashSet drops duplicates; its iteration (and thus join) order is
        // unspecified.
        Set<String> filenames = new HashSet<String>();
        for (String filename : input.second()) {
          filenames.add(filename);
        }
        Pair<K, String> pair =
            Pair.of(input.first(), StringUtils.join(filenames, ","));
        emitter.emit(pair);
      }
    });
  }

  /**
   * Builds an inverted-index table mapping each lower-cased word in the
   * input lines to the name of the file the line came from.
   *
   * @param lines input lines; each line's source filename is derived from
   *     the map task's input split
   * @return a table of (word, filename) pairs, one per word occurrence
   */
  public static PTable<String, String> extractWordFileTable(PCollection<String> lines) {
    PTypeFamily tf = lines.getTypeFamily();
    return lines.parallelDo(
        "inverted-index",
        new DoFn<String, Pair<String, String>>() {
          String filename;

          @Override
          public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
            super.setContext(context);
            // Assumes this DoFn runs inside a map task reading FileSplits;
            // capture the split's filename once per task, not per record.
            filename = ((FileSplit)
                ((MapContext) context).getInputSplit()).getPath().getName();
          }

          @Override
          public void process(String line,
                              Emitter<Pair<String, String>> emitter) {
            for (String word : StringUtils.split(line)) {
              // Locale.ROOT keeps the lower-casing locale-independent
              // (avoids e.g. the Turkish dotless-i mapping of 'I').
              Pair<String, String> pair =
                  Pair.of(word.toLowerCase(Locale.ROOT), filename);
              emitter.emit(pair);
            }
          }
        }, tf.tableOf(tf.strings(), tf.strings()));
  }

  /** Counters tracking log-parsing failures. */
  public enum LogCounters {
    /** Number of input lines that could not be decoded as log entries. */
    LOG_LINE_ERRORS
  }

  /**
   * Parses raw Apache log lines into {@link CommonLogEntry} records. Lines
   * that fail to decode are counted under
   * {@link LogCounters#LOG_LINE_ERRORS} and logged, not emitted.
   *
   * @param lines raw log lines
   * @return the successfully parsed log entries
   */
  public static PCollection<CommonLogEntry> logs(PCollection<String> lines) {
    PTypeFamily tf = lines.getTypeFamily();
    return lines
        .parallelDo(new DoFn<String, CommonLogEntry>() {
          // transient: DoFns are serialized out to the cluster; both fields
          // are recreated task-side in initialize().
          transient ApacheCommonLogReader logReader;
          transient Logger log;

          @Override
          public void initialize() {
            logReader = new ApacheCommonLogReader();
            log = LoggerFactory.getLogger(CrunchUtils.class);
          }

          @Override
          public void process(String input, Emitter<CommonLogEntry> emitter) {
            try {
              // 'entry' (not 'log') avoids shadowing the logger field above.
              CommonLogEntry entry = logReader.decodeLine(input);
              if (entry != null) {
                emitter.emit(entry);
              } else {
                // decodeLine signals an unparseable line by returning null.
                processingError(input, null);
              }
            } catch (IOException e) {
              processingError(input, e);
            }
          }

          // Records a bad line in both the job counter and the task log.
          void processingError(String line, @Nullable Throwable t) {
            super.getCounter(LogCounters.LOG_LINE_ERRORS).increment(1);
            log.error("Hit exception parsing line '" + line + "'", t);
          }
        }, tf.records(CommonLogEntry.class));
  }
}