package com.twitter.elephantbird.mapreduce.output;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import com.google.common.io.Files;
import com.twitter.elephantbird.util.HadoopCompat;
import com.twitter.elephantbird.util.PathFilters;
import com.twitter.elephantbird.util.TaskHeartbeatThread;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NoLockFactory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Base class for output formats that write Lucene indexes.
* <p>
* Subclasses must specify how to convert a key-value pair into a {@link Document}.
* <p>
* Subclasses may provide an {@link Analyzer} to use during index creation;
* whether it is actually used depends on how the subclass constructs its documents.
*
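* <p>
* A minimal subclass sketch (class and field names here are illustrative, not part
* of this API). {@code StringField} is not tokenized, so the default
* {@link NeverTokenizeAnalyzer} suffices:
* <pre>{@code
* public class TextDocOutputFormat extends LuceneIndexOutputFormat<Text, Text> {
*   protected Document buildDocument(Text key, Text value) {
*     Document doc = new Document();
*     doc.add(new StringField("id", key.toString(), Field.Store.YES));
*     doc.add(new StringField("body", value.toString(), Field.Store.NO));
*     return doc;
*   }
* }
* }</pre>
* The subclass is wired into a job in the usual way, e.g. via
* {@code job.setOutputFormatClass(TextDocOutputFormat.class)} together with
* {@link FileOutputFormat#setOutputPath}.
*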
* @author Alex Levenson, based on code written by Kyle Maxwell
*/
public abstract class LuceneIndexOutputFormat<K, V> extends FileOutputFormat<K, V> {
private static final Logger LOG = LoggerFactory.getLogger(LuceneIndexOutputFormat.class);
/**
* Converts a record from the MR framework into a Lucene {@link Document}.
* The same {@link Document} instance may be re-used across calls for efficiency.
*
* @param key the key written to this output format
* @param value the value written to this output format
* @return a Lucene {@link Document} suitable for insertion into an {@link IndexWriter}
* @throws IOException if the document cannot be built
*/
protected abstract Document buildDocument(K key, V value) throws IOException;
/**
* Override this method if you intend to use an {@link Analyzer} during index
* creation. If you do not override it, {@link NeverTokenizeAnalyzer} is used,
* which throws an exception if you create a {@link Document} in a way that
* invokes tokenization.
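* <p>
* For example, a subclass that tokenizes text fields might return Lucene's
* StandardAnalyzer (a sketch; it assumes the lucene-analyzers module is on the
* classpath, and any {@link Analyzer} will do):
* <pre>{@code
* protected Analyzer newAnalyzer(Configuration conf) {
*   return new StandardAnalyzer(Version.LUCENE_40);
* }
* }</pre>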
*
* @param conf the job's configuration
* @return an {@link Analyzer} suitable for use by an {@link IndexWriter}
*/
protected Analyzer newAnalyzer(Configuration conf) {
return new NeverTokenizeAnalyzer();
}
/**
* Override to use a different {@link Directory} implementation.
* <p>
* You may want to use {@link org.apache.lucene.store.FSDirectory#open},
* which is supposed to select a local FS implementation appropriate for the
* current OS. However, we have seen cases where the implementation it selects
* hits {@link java.lang.OutOfMemoryError} when building large indexes.
*
* @param location local directory to store the index in
* @return a {@link Directory} over the given location
*/
protected Directory getDirectoryImplementation(File location) throws IOException {
return new SimpleFSDirectory(location, NoLockFactory.getNoLockFactory());
}
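/**
* Creates an {@link IndexWriter} over the given directory, using the default
* merge factor of {@link LogByteSizeMergePolicy}.
*
* @param location directory the index will be written to
* @param analyzer analyzer to pass to the {@link IndexWriterConfig}
* @return a configured {@link IndexWriter}
*/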
public static IndexWriter createIndexWriter(Directory location, Analyzer analyzer) throws IOException {
return createIndexWriter(location, analyzer, LogByteSizeMergePolicy.DEFAULT_MERGE_FACTOR);
}
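/**
* Creates an {@link IndexWriter} configured with a {@link LogByteSizeMergePolicy}
* using the given merge factor, compound files disabled, and a
* {@link SerialMergeScheduler}.
*
* @param location directory the index will be written to
* @param analyzer analyzer to pass to the {@link IndexWriterConfig}
* @param mergeFactor merge factor for the {@link LogByteSizeMergePolicy}
* @return a configured {@link IndexWriter}
*/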
public static IndexWriter createIndexWriter(Directory location, Analyzer analyzer, int mergeFactor)
throws IOException {
LOG.info("Creating IndexWriter with:\nDirectory: "
+ location
+ "\nAnalyzer: "
+ analyzer
+ "\nMerge Factor: " + mergeFactor);
IndexWriterConfig idxConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);
LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
mergePolicy.setMergeFactor(mergeFactor);
mergePolicy.setUseCompoundFile(false);
idxConfig.setMergePolicy(mergePolicy);
idxConfig.setMergeScheduler(new SerialMergeScheduler());
return new IndexWriter(location, idxConfig);
}
@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException {
FileOutputCommitter committer = (FileOutputCommitter) this.getOutputCommitter(job);
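// The index is built in a local temp directory and copied to the committer's
// work path when the record writer is closed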
File tmpDirFile = Files.createTempDir();
Directory directory = getDirectoryImplementation(tmpDirFile);
IndexWriter writer = createIndexWriter(directory, newAnalyzer(HadoopCompat.getConfiguration(job)));
return new IndexRecordWriter(writer, committer, tmpDirFile);
}
private class IndexRecordWriter extends RecordWriter<K, V> {
private final IndexWriter writer;
private final FileOutputCommitter committer;
private final File tmpDirFile;
private long recordsProcessed = 0;
private IndexRecordWriter(IndexWriter writer, FileOutputCommitter committer, File tmpDirFile) {
this.writer = writer;
this.committer = committer;
this.tmpDirFile = tmpDirFile;
}
@Override
public void write(K key, V value) throws IOException {
recordsProcessed++;
if (recordsProcessed % 1000000 == 0) {
LOG.info("Processing record " + recordsProcessed);
}
writer.addDocument(buildDocument(key, value));
}
@Override
public void close(final TaskAttemptContext context) throws IOException, InterruptedException {
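// Keep the task alive while the potentially long-running merge and copy below
// execute; progress() is invoked periodically to log a rough status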
TaskHeartbeatThread heartBeat = new TaskHeartbeatThread(context) {
@Override
public void progress() {
String[] filesLeft = tmpDirFile.list();
if (filesLeft != null) {
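// Rough progress estimate only: the temp directory shrinks as segments are
// merged; the -2 presumably accounts for Lucene's bookkeeping files
// (such as the segments file)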
int remaining = filesLeft.length - 2;
LOG.info("Optimizing " + remaining + " segments");
} else {
LOG.info("Done optimizing segments, heartbeat thread still alive");
}
}
};
try {
LOG.info("Starting heartbeat thread");
heartBeat.start();
Path work = committer.getWorkPath();
Path output = new Path(work, "index-"
+ String.valueOf(HadoopCompat.getTaskAttemptID(context).getTaskID().getId()));
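// Merge the index down to a single segment before publishing it; this can
// take a long time for large indexes, hence the heartbeat thread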
writer.forceMerge(1);
writer.close();
FileSystem fs = FileSystem.get(HadoopCompat.getConfiguration(context));
LOG.info("Copying index to HDFS...");
if (!FileUtil.copy(tmpDirFile, fs, output, true, HadoopCompat.getConfiguration(context))) {
throw new IOException("Failed to copy local index to HDFS!");
}
LOG.info("Index written to: " + output);
} catch (IOException e) {
LOG.error("Error committing index", e);
throw e;
} finally {
// all things must die, eventually
LOG.info("Stopping heartbeat thread");
heartBeat.stop();
}
}
}
/**
* An {@link Analyzer} that always throws {@link UnsupportedOperationException}
* when {@link #createComponents(String, java.io.Reader)} is called.
* <p>
* Useful if you don't intend to use an {@link Analyzer} for tokenization
* but are required to provide one to an {@link org.apache.lucene.index.IndexWriter}.
*/
public static class NeverTokenizeAnalyzer extends Analyzer {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
throw new UnsupportedOperationException();
}
}
/**
* Creates a path filter that accepts non-hidden directories whose names start with "index-".
* Indexes created by this output format follow this naming convention, so the filter
* is useful for locating them when traversing the file system.
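* <p>
* For example, to find the index directories under a job's output path
* ({@code outputPath} below is an illustrative variable, not part of this API):
* <pre>{@code
* FileSystem fs = FileSystem.get(conf);
* FileStatus[] indexDirs =
*     fs.listStatus(outputPath, LuceneIndexOutputFormat.newIndexDirFilter(conf));
* }</pre>
*
* @param conf the job's configuration
* @return a {@link PathFilter} that accepts index directories created by this output format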
*/
public static PathFilter newIndexDirFilter(Configuration conf) {
return new PathFilters.CompositePathFilter(
PathFilters.newExcludeFilesFilter(conf),
PathFilters.EXCLUDE_HIDDEN_PATHS_FILTER,
new PathFilter() {
@Override
public boolean accept(Path path) {
return path.getName().startsWith("index-");
}
}
);
}
}