/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.solr;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Jdk14Logger;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import com.datasalt.pangool.io.ITuple;
/**
 * A record writer that builds a Solr index from the records it receives.
 *
 * A zip file containing the Solr configuration and additional libraries is expected to be passed via the
 * distributed cache. Incoming records are converted to Solr documents by the supplied converter and written
 * to the index in batches. When the task finishes, {@link #close} copies the index to the destination output
 * file system.
 * <p>
 * <b>This class was copied from the SOLR-1301 patch, although it may differ slightly from it.</b>
* <p>
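 * A minimal usage sketch (the writer is normally constructed by an output format; the parameter values
 * shown here are illustrative only):
 *
 * <pre>{@code
 * TupleDocumentConverter converter = ...; // maps each ITuple to a SolrInputDocument
 * SolrRecordWriter writer = new SolrRecordWriter(100, false, 2, 100,
 *     "/local/solr/home", "solr.zip", converter, context);
 * writer.write(tuple, NullWritable.get()); // batched; flushed every batchSize records
 * writer.close(context); // flushes the last batch and copies the index to the output FS
 * }</pre>
 * <p>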
*/
public class SolrRecordWriter extends RecordWriter<ITuple, NullWritable> {
static final Log LOG = LogFactory.getLog(SolrRecordWriter.class);
public final static List<String> allowedConfigDirectories = new ArrayList<String>(Arrays.asList("conf", "lib"));
public final static Set<String> requiredConfigDirectories = new HashSet<String>();
static {
requiredConfigDirectories.add("conf");
}
/**
* Return the list of directory names that may be included in the configuration data passed to the tasks.
*
* @return an UnmodifiableList of directory names
*/
public static List<String> getAllowedConfigDirectories() {
return Collections.unmodifiableList(allowedConfigDirectories);
}
/**
* Check whether the passed-in directory is required to be present in the configuration data set.
*
* @param directory
* The directory to check
* @return true if the directory is required.
*/
public static boolean isRequiredConfigDirectory(final String directory) {
return requiredConfigDirectories.contains(directory);
}
private TupleDocumentConverter converter;
private EmbeddedSolrServer solr;
private SolrCore core;
private FileSystem fs;
private int batchSize;
/** The path that the final index will be written to */
private Path perm;
/** The location in a local temporary directory that the index is built in. */
private Path temp;
/** The directory that the configuration zip file was unpacked into. */
private Path solrHome = null;
private static AtomicLong sequence = new AtomicLong(0);
/**
* If true, create a zip file of the completed index in the final storage location. A .zip suffix will be
* appended to the final output name if it is not already present.
*/
private boolean outputZipFile = false;
private Configuration conf;
HeartBeater heartBeater = null;
private BatchWriter batchWriter = null;
private String localSolrHome;
private String zipName;
@SuppressWarnings("rawtypes")
private static HashMap<TaskID, Reducer.Context> contextMap = new HashMap<TaskID, Reducer.Context>();
protected boolean isClosing() {
return closing;
}
protected void setClosing(boolean closing) {
this.closing = closing;
}
/** If true, writes will throw an exception */
volatile boolean closing = false;
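/**
 * Build the per-task output file name: the given prefix plus the zero-padded task partition,
 * e.g. prefix "part" and partition 3 yield "part-00003", matching Hadoop's part-file convention.
 */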
private String getOutFileName(TaskAttemptContext context, String prefix) {
TaskID taskId = context.getTaskAttemptID().getTaskID();
int partition = taskId.getId();
NumberFormat nf = NumberFormat.getInstance();
nf.setMinimumIntegerDigits(5);
nf.setGroupingUsed(false);
StringBuilder result = new StringBuilder();
result.append(prefix);
result.append("-");
result.append(nf.format(partition));
return result.toString();
}
public SolrRecordWriter(int batchSize, boolean outputZipFile, int threadCount, int queueSize, String localSolrHome,
String zipName, TupleDocumentConverter converter, TaskAttemptContext context) {
this.localSolrHome = localSolrHome;
this.zipName = zipName;
conf = context.getConfiguration();
this.batchSize = batchSize;
setLogLevel("org.apache.solr.core", "WARN");
setLogLevel("org.apache.solr.update", "WARN");
Logger.getLogger("org.apache.solr.core").setLevel(Level.WARN);
Logger.getLogger("org.apache.solr.update").setLevel(Level.WARN);
java.util.logging.Logger.getLogger("org.apache.solr.core").setLevel(java.util.logging.Level.WARNING);
java.util.logging.Logger.getLogger("org.apache.solr.update").setLevel(java.util.logging.Level.WARNING);
setLogLevel("org.apache.solr", "WARN");
Logger.getLogger("org.apache.solr").setLevel(Level.WARN);
java.util.logging.Logger.getLogger("org.apache.solr").setLevel(java.util.logging.Level.WARNING);
heartBeater = new HeartBeater(context);
try {
heartBeater.needHeartBeat();
this.outputZipFile = outputZipFile;
this.fs = FileSystem.get(conf);
perm = new Path(FileOutputFormat.getOutputPath(context), getOutFileName(context, "part"));
// Make a task unique name that contains the actual index output name to
// make debugging simpler
// Note: if using JVM reuse, the sequence number will not be reset for a
// new task using the jvm
temp = conf.getLocalPath("mapred.local.dir",
"solr_" + conf.get("mapred.task.id") + '.' + sequence.incrementAndGet());
if(outputZipFile && !perm.getName().endsWith(".zip")) {
perm = perm.suffix(".zip");
}
fs.delete(perm, true); // delete old, if any
Path local = fs.startLocalOutput(perm, temp);
solrHome = findSolrConfig(conf);
// Verify that the solr home has a conf and lib directory
if(solrHome == null) {
throw new IOException("Unable to find solr home setting");
}
// Set up a Solr instance that we can batch writes to
LOG.info("SolrHome: " + solrHome.toUri());
String dataDir = new File(local.toString(), "data").toString();
// copy the unpacked conf directory (schema, solrconfig, etc.) to the local conf dir
File confDir = new File(local.toString(), "conf");
confDir.mkdirs();
File unpackedSolrHome = new File(solrHome.toString());
FileUtils.copyDirectory(new File(unpackedSolrHome, "conf"), confDir);
Properties props = new Properties();
props.setProperty("solr.data.dir", dataDir);
props.setProperty("solr.home", solrHome.toString());
SolrResourceLoader loader = new SolrResourceLoader(solrHome.toString(), null, props);
LOG.info(String
.format(
"Constructed instance information solr.home %s (%s), instance dir %s, conf dir %s, writing index to temporary directory %s, with permdir %s",
solrHome, solrHome.toUri(), loader.getInstanceDir(), loader.getConfigDir(), dataDir, perm));
CoreContainer container = new CoreContainer(loader);
CoreDescriptor descr = new CoreDescriptor(container, "core1", solrHome.toString());
descr.setDataDir(dataDir);
descr.setCoreProperties(props);
core = container.create(descr);
container.register(core, false);
solr = new EmbeddedSolrServer(container, "core1");
batchWriter = new BatchWriter(solr, batchSize, context.getTaskAttemptID().getTaskID(), threadCount, queueSize);
this.converter = converter;
} catch(Exception e) {
throw new IllegalStateException(String.format("Failed to initialize record writer for %s, %s",
context.getJobName(), conf.get("mapred.task.id")), e);
} finally {
heartBeater.cancelHeartBeat();
}
}
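/**
 * Increment a counter on the reducer context registered for the given task via
 * {@link #addReducerContext}; silently does nothing if no context was registered.
 */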
public static void incrementCounter(TaskID taskId, String groupName, String counterName, long incr) {
@SuppressWarnings("rawtypes")
Reducer.Context context = contextMap.get(taskId);
if(context != null) {
context.getCounter(groupName, counterName).increment(incr);
}
}
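/**
 * Register a reducer context under its task id so that collaborators (the {@link BatchWriter},
 * for instance) can report progress through {@link #incrementCounter}.
 */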
public static void addReducerContext(@SuppressWarnings("rawtypes") Reducer.Context context) {
TaskID taskID = context.getTaskAttemptID().getTaskID();
if(contextMap.get(taskID) == null) {
contextMap.put(taskID, context);
}
}
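/**
 * Locate the unpacked Solr home directory. When running against the local file system the configured
 * {@link #localSolrHome} is used directly; otherwise the distributed cache archives are scanned for the
 * directory unpacked from {@code zipName}.
 */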
private Path findSolrConfig(Configuration conf) throws IOException {
Path solrHome = null;
// we added these lines to make this patch work on Hadoop 0.20.2
FileSystem localFs = FileSystem.getLocal(conf);
if(FileSystem.get(conf).equals(localFs)) {
return new Path(localSolrHome);
}
// end-of-addition
Path[] localArchives = DistributedCache.getLocalCacheArchives(conf);
if(localArchives.length == 0) {
throw new IOException(String.format("No local cache archives, where is %s", zipName));
}
for(Path unpackedDir : localArchives) {
// Only logged if debugging
if(LOG.isDebugEnabled()) {
LOG.debug(String.format("Examining unpack directory %s for %s", unpackedDir, zipName));
ProcessBuilder lsCmd = new ProcessBuilder(new String[] { "/bin/ls", "-lR", unpackedDir.toString() });
lsCmd.redirectErrorStream(true); // merge stderr into stdout so a single stream can be drained
Process ls = lsCmd.start();
try {
byte[] buf = new byte[16 * 1024];
InputStream all = ls.getInputStream();
int count;
while((count = all.read(buf)) > 0) {
System.err.write(buf, 0, count);
}
ls.waitFor(); // exitValue() throws if the process has not yet terminated
System.err.format("Exit value is %d%n", ls.exitValue());
} catch(IOException ignore) {
} catch(InterruptedException e) {
Thread.currentThread().interrupt();
}
}
if(unpackedDir.getName().equals(zipName)) {
solrHome = unpackedDir;
break;
}
}
return solrHome;
}
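/** Documents accumulated since the last batch was queued to the {@link BatchWriter}. */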
Collection<SolrInputDocument> batch = new ArrayList<SolrInputDocument>();
/**
* Write a record. This method accumulates records into a batch and, once {@link #batchSize} items are
* present, flushes it to the indexer. The writes can take a substantial amount of time, depending on
* {@link #batchSize}. If there is heavy disk contention, the writes may exceed the 600 second default
* task timeout.
*/
@Override
public void write(ITuple key, NullWritable value) throws IOException {
if(isClosing()) {
throw new IOException("Index is already closing");
}
heartBeater.needHeartBeat();
try {
try {
batch.add(converter.convert(key, value));
if(batch.size() >= batchSize) {
batchWriter.queueBatch(batch);
batch.clear();
}
} catch(SolrServerException e) {
throw new IOException(e);
}
} finally {
heartBeater.cancelHeartBeat();
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
if(context != null) {
heartBeater.setProgress(context);
}
try {
if(batch.size() > 0) {
batchWriter.queueBatch(batch);
batch.clear();
}
heartBeater.needHeartBeat();
batchWriter.close(context, core);
if(outputZipFile) {
context.setStatus("Writing Zip");
packZipFile(); // Written to the perm location
} else {
context.setStatus("Copying Index");
fs.completeLocalOutput(perm, temp); // copy to dfs
}
} catch(Exception e) {
if(e instanceof IOException) {
throw (IOException) e;
}
throw new IOException(e);
} finally {
heartBeater.cancelHeartBeat();
File tempFile = new File(temp.toString());
if(tempFile.exists()) {
FileUtils.forceDelete(tempFile);
}
}
context.setStatus("Done");
}
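/**
 * Zip the local temporary index directory {@link #temp} into the final output location {@link #perm}
 * on the destination file system.
 */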
private void packZipFile() throws IOException {
FSDataOutputStream out = null;
ZipOutputStream zos = null;
int zipCount = 0;
LOG.info("Packing zip file for " + perm);
try {
out = fs.create(perm, false);
zos = new ZipOutputStream(out);
String name = perm.getName().replaceAll("\\.zip$", "");
LOG.info("adding index directory " + temp);
zipCount = zipDirectory(conf, zos, name, temp.toString(), temp);
} catch(Throwable ohFoo) {
LOG.error("packZipFile exception", ohFoo);
if(ohFoo instanceof RuntimeException) {
throw (RuntimeException) ohFoo;
}
if(ohFoo instanceof IOException) {
throw (IOException) ohFoo;
}
throw new IOException(ohFoo);
} finally {
if(zos != null) {
if(zipCount == 0) {
// If no entries were written, closing the ZipOutputStream would throw, so close the
// underlying stream and delete the empty output file instead.
LOG.error("No entries written to zip file " + perm);
out.close();
fs.delete(perm, false);
} else {
LOG.info(String.format("Wrote %d items to %s for %s", zipCount, perm, temp));
zos.close();
}
}
}
}
/**
* Write a file to a zip output stream, removing leading path name components from the actual file name when creating
* the zip file entry.
*
* The entry placed in the zip file is <code>baseName</code>/ <code>relativePath</code>, where
* <code>relativePath</code> is constructed by removing a leading <code>root</code> from the path for
* <code>itemToZip</code>.
*
* If <code>itemToZip</code> is a directory, its entry is written and the contents of the directory are
* added recursively; for an empty directory, only the directory entry itself is written.
*
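* For example (hypothetical paths), with <code>baseName</code> "part-00000", <code>root</code>
* "/tmp/solr_local" and <code>itemToZip</code> "/tmp/solr_local/data/index/segments_2", the entry
* written is "part-00000/data/index/segments_2".
*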
* @param zos
* The zip output stream
* @param baseName
* The base name to use for the file name entry in the zip file
* @param root
* The path to remove from <code>itemToZip</code> to make a relative path name
* @param itemToZip
* The path to the file to be added to the zip file
* @return the number of entries added
* @throws IOException
*/
static public int zipDirectory(final Configuration conf, final ZipOutputStream zos, final String baseName,
final String root, final Path itemToZip) throws IOException {
LOG.info(String.format("zipDirectory: %s %s %s", baseName, root, itemToZip));
LocalFileSystem localFs = FileSystem.getLocal(conf);
int count = 0;
final FileStatus itemStatus = localFs.getFileStatus(itemToZip);
if(itemStatus.isDir()) {
final FileStatus[] statai = localFs.listStatus(itemToZip);
// Add a directory entry to the zip file
final String zipDirName = relativePathForZipEntry(itemToZip.toUri().getPath(), baseName, root);
final ZipEntry dirZipEntry = new ZipEntry(zipDirName + Path.SEPARATOR_CHAR);
LOG.info(String.format("Adding directory %s to zip", zipDirName));
zos.putNextEntry(dirZipEntry);
zos.closeEntry();
count++;
if(statai == null || statai.length == 0) {
LOG.info(String.format("Skipping empty directory %s", itemToZip));
return count;
}
for(FileStatus status : statai) {
count += zipDirectory(conf, zos, baseName, root, status.getPath());
}
LOG.info(String.format("Wrote %d entries for directory %s", count, itemToZip));
return count;
}
final String inZipPath = relativePathForZipEntry(itemToZip.toUri().getPath(), baseName, root);
if(inZipPath.length() == 0) {
LOG.warn(String.format("Skipping empty zip file path for %s (%s %s)", itemToZip, root, baseName));
return 0;
}
// Include empty files, in case a placeholder entry is needed
FSDataInputStream in = null;
try {
in = localFs.open(itemToZip);
final ZipEntry ze = new ZipEntry(inZipPath);
ze.setTime(itemStatus.getModificationTime());
// Entry comments clutter inspection of the zip file, so none is set
// ze.setComment(itemToZip.toString());
zos.putNextEntry(ze);
IOUtils.copyBytes(in, zos, conf, false);
zos.closeEntry();
LOG.info(String.format("Wrote zip entry for file %s", itemToZip));
return 1;
} finally {
if(in != null) {
in.close();
}
}
}
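/**
 * Strip the leading {@code root} prefix from {@code rawPath} and prepend {@code baseName}; if nothing
 * remains after removing the root, {@code baseName} alone is returned. For example (hypothetical paths),
 * rawPath "/tmp/work/data/index", root "/tmp/work" and baseName "part-00000" give "part-00000/data/index".
 */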
static String relativePathForZipEntry(final String rawPath, final String baseName, final String root) {
String relativePath = rawPath.replaceFirst(Pattern.quote(root), "");
LOG.info(String.format("RawPath %s, baseName %s, root %s, first %s", rawPath, baseName, root, relativePath));
if(relativePath.startsWith(Path.SEPARATOR)) {
relativePath = relativePath.substring(1);
}
LOG.info(String.format("RawPath %s, baseName %s, root %s, post leading slash %s", rawPath, baseName, root,
relativePath));
if(relativePath.isEmpty()) {
LOG.warn(String.format("No data after root (%s) removal from raw path %s", root, rawPath));
return baseName;
}
// Construct the path that will be written to the zip file, including
// removing any leading '/' characters
String inZipPath = baseName + Path.SEPARATOR_CHAR + relativePath;
LOG.info(String.format("RawPath %s, baseName %s, root %s, inZip 1 %s", rawPath, baseName, root, inZipPath));
if(inZipPath.startsWith(Path.SEPARATOR)) {
inZipPath = inZipPath.substring(1);
}
LOG.info(String.format("RawPath %s, baseName %s, root %s, inZip 2 %s", rawPath, baseName, root, inZipPath));
return inZipPath;
}
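/**
 * Best-effort log level adjustment through commons-logging: the Log4J and JDK14 wrappers are handled;
 * for any other implementation (an SLF4J bridge, say) this returns false and the underlying logger
 * must be configured directly.
 */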
static boolean setLogLevel(String packageName, String level) {
Log logger = LogFactory.getLog(packageName);
if(logger == null) {
return false;
}
// Log the concrete logger implementation; org.apache.commons.logging.impl.SLF4JLocationAwareLog,
// for example, indicates an SLF4J bridge that neither branch below can adjust.
LOG.warn("logger class:" + logger.getClass().getName());
if(logger instanceof Log4JLogger) {
process(((Log4JLogger) logger).getLogger(), level);
return true;
}
if(logger instanceof Jdk14Logger) {
process(((Jdk14Logger) logger).getLogger(), level);
return true;
}
return false;
}
public static void process(org.apache.log4j.Logger log, String level) {
if(level != null) {
log.setLevel(org.apache.log4j.Level.toLevel(level));
}
}
public static void process(java.util.logging.Logger log, String level) {
if(level != null) {
log.setLevel(java.util.logging.Level.parse(level));
}
}
}