/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.solr;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Jdk14Logger;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import com.datasalt.pangool.io.ITuple;
/**
 * A record writer that builds a Solr index from the records it receives.
 *
 * A zip file containing the Solr configuration and additional libraries is expected to be passed via the
 * distributed cache. Incoming records are converted to Solr documents by the supplied converter and written
 * to the index in batches. When the task finishes, {@link #close} copies the index to the destination output
 * file system.
 * <p>
 * <b>This class was copied from the SOLR-1301 patch, although it may differ slightly from it.</b>
* <p>
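 * A minimal usage sketch (the writer is normally constructed by an output format; the parameter values
 * shown here are illustrative only):
 *
 * <pre>{@code
 * TupleDocumentConverter converter = ...; // maps each ITuple to a SolrInputDocument
 * SolrRecordWriter writer = new SolrRecordWriter(100, false, 2, 100,
 *     "/local/solr/home", "solr.zip", converter, context);
 * writer.write(tuple, NullWritable.get()); // batched; flushed every batchSize records
 * writer.close(context); // flushes the last batch and copies the index to the output FS
 * }</pre>
 * <p>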
*/
public class SolrRecordWriter extends RecordWriter<ITuple, NullWritable> {
static final Log LOG = LogFactory.getLog(SolrRecordWriter.class);
public final static List<String> allowedConfigDirectories = new ArrayList<String>(Arrays.asList("conf", "lib"));
public final static Set<String> requiredConfigDirectories = new HashSet<String>();
static {
requiredConfigDirectories.add("conf");
}
/**
* Return the list of directory names that may be included in the configuration data passed to the tasks.
*
* @return an UnmodifiableList of directory names
*/
public static List<String> getAllowedConfigDirectories() {
return Collections.unmodifiableList(allowedConfigDirectories);
}
/**
* Check whether the passed-in directory is required to be present in the configuration data set.
*
* @param directory
* The directory to check
* @return true if the directory is required.
*/
public static boolean isRequiredConfigDirectory(final String directory) {
return requiredConfigDirectories.contains(directory);
}
private TupleDocumentConverter converter;
private EmbeddedSolrServer solr;
private SolrCore core;
private FileSystem fs;
private int batchSize;
/** The path that the final index will be written to */
private Path perm;
/** The location in a local temporary directory that the index is built in. */
private Path temp;
/** The directory that the configuration zip file was unpacked into. */
private Path solrHome = null;
private static AtomicLong sequence = new AtomicLong(0);
/**
* If true, create a zip file of the completed index in the final storage location. A .zip suffix will be
* appended to the final output name if it is not already present.
*/
private boolean outputZipFile = false;
private Configuration conf;
HeartBeater heartBeater = null;
private BatchWriter batchWriter = null;
private String localSolrHome;
private String zipName;
@SuppressWarnings("rawtypes")
private static HashMap<TaskID, Reducer.Context> contextMap = new HashMap<TaskID, Reducer.Context>();
protected boolean isClosing() {
return closing;
}
protected void setClosing(boolean closing) {
this.closing = closing;
}
/** If true, writes will throw an exception */
volatile boolean closing = false;
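/**
 * Build the per-task output file name: the given prefix plus the zero-padded task partition,
 * e.g. prefix "part" and partition 3 yield "part-00003", matching Hadoop's part-file convention.
 */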
private String getOutFileName(TaskAttemptContext context, String prefix) {
TaskID taskId = context.getTaskAttemptID().getTaskID();
int partition = taskId.getId();
NumberFormat nf = NumberFormat.getInstance();
nf.setMinimumIntegerDigits(5);
nf.setGroupingUsed(false);
StringBuilder result = new StringBuilder();
result.append(prefix);
result.append("-");
result.append(nf.format(partition));
return result.toString();
}
public SolrRecordWriter(int batchSize, boolean outputZipFile, int threadCount, int queueSize, String localSolrHome,
String zipName, TupleDocumentConverter converter, TaskAttemptContext context) {
this.localSolrHome = localSolrHome;
this.zipName = zipName;
conf = context.getConfiguration();
this.batchSize = batchSize;
setLogLevel("org.apache.solr.core", "WARN");
setLogLevel("org.apache.solr.update", "WARN");
Logger.getLogger("org.apache.solr.core").setLevel(Level.WARN);
Logger.getLogger("org.apache.solr.update").setLevel(Level.WARN);
java.util.logging.Logger.getLogger("org.apache.solr.core").setLevel(java.util.logging.Level.WARNING);
java.util.logging.Logger.getLogger("org.apache.solr.update").setLevel(java.util.logging.Level.WARNING);
setLogLevel("org.apache.solr", "WARN");
Logger.getLogger("org.apache.solr").setLevel(Level.WARN);
java.util.logging.Logger.getLogger("org.apache.solr").setLevel(java.util.logging.Level.WARNING);
heartBeater = new HeartBeater(context);
try {
heartBeater.needHeartBeat();
this.outputZipFile = outputZipFile;
this.fs = FileSystem.get(conf);
perm = new Path(FileOutputFormat.getOutputPath(context), getOutFileName(context, "part"));
// Make a task unique name that contains the actual index output name to
// make debugging simpler
// Note: if using JVM reuse, the sequence number will not be reset for a
// new task using the jvm
temp = conf.getLocalPath("mapred.local.dir",
"solr_" + conf.get("mapred.task.id") + '.' + sequence.incrementAndGet());
if(outputZipFile && !perm.getName().endsWith(".zip")) {
perm = perm.suffix(".zip");
}
fs.delete(perm, true); // delete old, if any
Path local = fs.startLocalOutput(perm, temp);
solrHome = findSolrConfig(conf);
// Verify that the solr home has a conf and lib directory
if(solrHome == null) {
throw new IOException("Unable to find solr home setting");
}
// Set up a Solr instance that we can batch writes to
LOG.info("SolrHome: " + solrHome.toUri());
String dataDir = new File(local.toString(), "data").toString();
// copy the unpacked conf directory (schema, solrconfig, etc.) to the local conf dir
File confDir = new File(local.toString(), "conf");
confDir.mkdirs();
File unpackedSolrHome = new File(solrHome.toString());
FileUtils.copyDirectory(new File(unpackedSolrHome, "conf"), confDir);
Properties props = new Properties();
props.setProperty("solr.data.dir", dataDir);
props.setProperty("solr.home", solrHome.toString());
SolrResourceLoader loader = new SolrResourceLoader(solrHome.toString(), null, props);
LOG.info(String
.format(
"Constructed instance information solr.home %s (%s), instance dir %s, conf dir %s, writing index to temporary directory %s, with permdir %s",
solrHome, solrHome.toUri(), loader.getInstanceDir(), loader.getConfigDir(), dataDir, perm));
CoreContainer container = new CoreContainer(loader);
CoreDescriptor descr = new CoreDescriptor(container, "core1", solrHome.toString());
descr.setDataDir(dataDir);
descr.setCoreProperties(props);
core = container.create(descr);
container.register(core, false);
solr = new EmbeddedSolrServer(container, "core1");
batchWriter = new BatchWriter(solr, batchSize, context.getTaskAttemptID().getTaskID(), threadCount, queueSize);
this.converter = converter;
} catch(Exception e) {
throw new IllegalStateException(String.format("Failed to initialize record writer for %s, %s",
context.getJobName(), conf.get("mapred.task.id")), e);
} finally {
heartBeater.cancelHeartBeat();
}
}
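/**
 * Increment a counter on the reducer context registered for the given task via
 * {@link #addReducerContext}; silently does nothing if no context was registered.
 */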
public static void incrementCounter(TaskID taskId, String groupName, String counterName, long incr) {
@SuppressWarnings("rawtypes")
Reducer.Context context = contextMap.get(taskId);
if(context != null) {
context.getCounter(groupName, counterName).increment(incr);
}
}
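/**
 * Register a reducer context under its task id so that collaborators (the {@link BatchWriter},
 * for instance) can report progress through {@link #incrementCounter}.
 */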
public static void addReducerContext(@SuppressWarnings("rawtypes") Reducer.Context context) {
TaskID taskID = context.getTaskAttemptID().getTaskID();
if(contextMap.get(taskID) == null) {
contextMap.put(taskID, context);
}
}
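/**
 * Locate the unpacked Solr home directory. When running against the local file system the configured
 * {@link #localSolrHome} is used directly; otherwise the distributed cache archives are scanned for the
 * directory unpacked from {@code zipName}.
 */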
private Path findSolrConfig(Configuration conf) throws IOException {
Path solrHome = null;
// we added these lines to make this patch work on Hadoop 0.20.2
FileSystem localFs = FileSystem.getLocal(conf);
if(FileSystem.get(conf).equals(localFs)) {
return new Path(localSolrHome);
}
// end-of-addition
Path[] localArchives = DistributedCache.getLocalCacheArchives(conf);
if(localArchives.length == 0) {
throw new IOException(String.format("No local cache archives, where is %s", zipName));
}
for(Path unpackedDir : localArchives) {
// Only logged if debugging
if(LOG.isDebugEnabled()) {
LOG.debug(String.format("Examining unpack directory %s for %s", unpackedDir, zipName));
ProcessBuilder lsCmd = new ProcessBuilder(new String[] { "/bin/ls", "-lR", unpackedDir.toString() });
lsCmd.redirectErrorStream(true); // merge stderr into stdout so a single stream can be drained
Process ls = lsCmd.start();
try {
byte[] buf = new byte[16 * 1024];
InputStream all = ls.getInputStream();
int count;
while((count = all.read(buf)) > 0) {
System.err.write(buf, 0, count);
}
ls.waitFor(); // exitValue() throws if the process has not yet terminated
System.err.format("Exit value is %d%n", ls.exitValue());
} catch(IOException ignore) {
} catch(InterruptedException e) {
Thread.currentThread().interrupt();
}
}
if(unpackedDir.getName().equals(zipName)) {
solrHome = unpackedDir;
break;
}
}
return solrHome;
}
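/** Documents accumulated since the last batch was queued to the {@link BatchWriter}. */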
Collection<SolrInputDocument> batch = new ArrayList<SolrInputDocument>();
/**
* Write a record. This method accumulates records into a batch and, once {@link #batchSize} items are
* present, flushes it to the indexer. The writes can take a substantial amount of time, depending on
* {@link #batchSize}. If there is heavy disk contention, the writes may exceed the 600 second default
* task timeout.
*/
@Override
public void write(ITuple key, NullWritable value) throws IOException {
if(isClosing()) {
throw new IOException("Index is already closing");
}
heartBeater.needHeartBeat();
try {
try {
batch.add(converter.convert(key, value));
if(batch.size() >= batchSize) {
batchWriter.queueBatch(batch);
batch.clear();
}
} catch(SolrServerException e) {
throw new IOException(e);
}
} finally {
heartBeater.cancelHeartBeat();
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
if(context != null) {
heartBeater.setProgress(context);
}
try {
if(batch.size() > 0) {
batchWriter.queueBatch(batch);
batch.clear();
}
heartBeater.needHeartBeat();
batchWriter.close(context, core);
if(outputZipFile) {
context.setStatus("Writing Zip");
packZipFile(); // Written to the perm location
} else {
context.setStatus("Copying Index");
fs.completeLocalOutput(perm, temp); // copy to dfs
}
} catch(Exception e) {
if(e instanceof IOException) {
throw (IOException) e;
}
throw new IOException(e);
} finally {
heartBeater.cancelHeartBeat();
File tempFile = new File(temp.toString());
if(tempFile.exists()) {
FileUtils.forceDelete(tempFile);
}
}
context.setStatus("Done");
}
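/**
 * Zip the local temporary index directory {@link #temp} into the final output location {@link #perm}
 * on the destination file system.
 */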
private void packZipFile() throws IOException {
FSDataOutputStream out = null;
ZipOutputStream zos = null;
int zipCount = 0;
LOG.info("Packing zip file for " + perm);
try {
out = fs.create(perm, false);
zos = new ZipOutputStream(out);
String name = perm.getName().replaceAll("\\.zip$", "");
LOG.info("adding index directory " + temp);
zipCount = zipDirectory(conf, zos, name, temp.toString(), temp);
} catch(Throwable ohFoo) {
LOG.error("packZipFile exception", ohFoo);
if(ohFoo instanceof RuntimeException) {
throw (RuntimeException) ohFoo;
}
if(ohFoo instanceof IOException) {
throw (IOException) ohFoo;
}
throw new IOException(ohFoo);
} finally {
if(zos != null) {
if(zipCount == 0) {
// If no entries were written, closing the ZipOutputStream would throw, so close the
// underlying stream and delete the empty output file instead.
LOG.error("No entries written to zip file " + perm);
out.close();
fs.delete(perm, false);
} else {
LOG.info(String.format("Wrote %d items to %s for %s", zipCount, perm, temp));
zos.close();
}
}
}
}
/**
* Write a file to a zip output stream, removing leading path name components from the actual file name when creating
* the zip file entry.
*
* The entry placed in the zip file is <code>baseName</code>/ <code>relativePath</code>, where
* <code>relativePath</code> is constructed by removing a leading <code>root</code> from the path for
* <code>itemToZip</code>.
*
* If <code>itemToZip</code> is a directory, its entry is written and the contents of the directory are
* added recursively; for an empty directory, only the directory entry itself is written.
*
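* For example (hypothetical paths), with <code>baseName</code> "part-00000", <code>root</code>
* "/tmp/solr_local" and <code>itemToZip</code> "/tmp/solr_local/data/index/segments_2", the entry
* written is "part-00000/data/index/segments_2".
*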
* @param zos
* The zip output stream
* @param baseName
* The base name to use for the file name entry in the zip file
* @param root
* The path to remove from <code>itemToZip</code> to make a relative path name
* @param itemToZip
* The path to the file to be added to the zip file
* @return the number of entries added
* @throws IOException
*/
static public int zipDirectory(final Configuration conf, final ZipOutputStream zos, final String baseName,
final String root, final Path itemToZip) throws IOException {
LOG.info(String.format("zipDirectory: %s %s %s", baseName, root, itemToZip));
LocalFileSystem localFs = FileSystem.getLocal(conf);
int count = 0;
final FileStatus itemStatus = localFs.getFileStatus(itemToZip);
if(itemStatus.isDir()) {
final FileStatus[] statai = localFs.listStatus(itemToZip);
// Add a directory entry to the zip file
final String zipDirName = relativePathForZipEntry(itemToZip.toUri().getPath(), baseName, root);
final ZipEntry dirZipEntry = new ZipEntry(zipDirName + Path.SEPARATOR_CHAR);
LOG.info(String.format("Adding directory %s to zip", zipDirName));
zos.putNextEntry(dirZipEntry);
zos.closeEntry();
count++;
if(statai == null || statai.length == 0) {
LOG.info(String.format("Skipping empty directory %s", itemToZip));
return count;
}
for(FileStatus status : statai) {
count += zipDirectory(conf, zos, baseName, root, status.getPath());
}
LOG.info(String.format("Wrote %d entries for directory %s", count, itemToZip));
return count;
}
final String inZipPath = relativePathForZipEntry(itemToZip.toUri().getPath(), baseName, root);
if(inZipPath.length() == 0) {
LOG.warn(String.format("Skipping empty zip file path for %s (%s %s)", itemToZip, root, baseName));
return 0;
}
// Include empty files, in case a placeholder entry is needed
FSDataInputStream in = null;
try {
in = localFs.open(itemToZip);
final ZipEntry ze = new ZipEntry(inZipPath);
ze.setTime(itemStatus.getModificationTime());
// Entry comments clutter inspection of the zip file, so none is set
// ze.setComment(itemToZip.toString());
zos.putNextEntry(ze);
IOUtils.copyBytes(in, zos, conf, false);
zos.closeEntry();
LOG.info(String.format("Wrote zip entry for file %s", itemToZip));
return 1;
} finally {
if(in != null) {
in.close();
}
}
}
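/**
 * Strip the leading {@code root} prefix from {@code rawPath} and prepend {@code baseName}; if nothing
 * remains after removing the root, {@code baseName} alone is returned. For example (hypothetical paths),
 * rawPath "/tmp/work/data/index", root "/tmp/work" and baseName "part-00000" give "part-00000/data/index".
 */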
static String relativePathForZipEntry(final String rawPath, final String baseName, final String root) {
String relativePath = rawPath.replaceFirst(Pattern.quote(root), "");
LOG.info(String.format("RawPath %s, baseName %s, root %s, first %s", rawPath, baseName, root, relativePath));
if(relativePath.startsWith(Path.SEPARATOR)) {
relativePath = relativePath.substring(1);
}
LOG.info(String.format("RawPath %s, baseName %s, root %s, post leading slash %s", rawPath, baseName, root,
relativePath));
if(relativePath.isEmpty()) {
LOG.warn(String.format("No data after root (%s) removal from raw path %s", root, rawPath));
return baseName;
}
// Construct the path that will be written to the zip file, including
// removing any leading '/' characters
String inZipPath = baseName + Path.SEPARATOR_CHAR + relativePath;
LOG.info(String.format("RawPath %s, baseName %s, root %s, inZip 1 %s", rawPath, baseName, root, inZipPath));
if(inZipPath.startsWith(Path.SEPARATOR)) {
inZipPath = inZipPath.substring(1);
}
LOG.info(String.format("RawPath %s, baseName %s, root %s, inZip 2 %s", rawPath, baseName, root, inZipPath));
return inZipPath;
}
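/**
 * Best-effort log level adjustment through commons-logging: the Log4J and JDK14 wrappers are handled;
 * for any other implementation (an SLF4J bridge, say) this returns false and the underlying logger
 * must be configured directly.
 */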
static boolean setLogLevel(String packageName, String level) {
Log logger = LogFactory.getLog(packageName);
if(logger == null) {
return false;
}
// Log the concrete logger implementation; org.apache.commons.logging.impl.SLF4JLocationAwareLog,
// for example, indicates an SLF4J bridge that neither branch below can adjust.
LOG.warn("logger class:" + logger.getClass().getName());
if(logger instanceof Log4JLogger) {
process(((Log4JLogger) logger).getLogger(), level);
return true;
}
if(logger instanceof Jdk14Logger) {
process(((Jdk14Logger) logger).getLogger(), level);
return true;
}
return false;
}
public static void process(org.apache.log4j.Logger log, String level) {
if(level != null) {
log.setLevel(org.apache.log4j.Level.toLevel(level));
}
}
public static void process(java.util.logging.Logger log, String level) {
if(level != null) {
log.setLevel(java.util.logging.Level.parse(level));
}
}
}