Source Code of eu.stratosphere.api.common.io.DelimitedInputFormat$ConfigBuilder

/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/


package eu.stratosphere.api.common.io;


import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;


import com.google.common.base.Charsets;


import eu.stratosphere.api.common.io.statistics.BaseStatistics;
import eu.stratosphere.api.common.operators.base.FileDataSourceBase;
import eu.stratosphere.configuration.ConfigConstants;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.configuration.GlobalConfiguration;
import eu.stratosphere.core.fs.FileInputSplit;
import eu.stratosphere.core.fs.FileStatus;
import eu.stratosphere.core.fs.FileSystem;
import eu.stratosphere.core.fs.Path;


/**
 * Base implementation for input formats that split the input at a delimiter into records.
 * The parsing of the record bytes into the record has to be implemented in the
 * {@link #readRecord(Object, byte[], int, int)} method.
 * <p>
 * The default delimiter is the newline character {@code '\n'}.
 */
public abstract class DelimitedInputFormat<OT> extends FileInputFormat<OT> {
  
  private static final long serialVersionUID = 1L;


  // -------------------------------------- Constants -------------------------------------------
  
  /**
   * The logger of this class.
   */
  private static final Log LOG = LogFactory.getLog(DelimitedInputFormat.class);
  
  /**
   * The default read buffer size = 1MB. It is used when no positive buffer size has been set at the time
   * a split is opened, and it is the lower bound applied to a configured maximum sample record length.
   */
  private static final int DEFAULT_READ_BUFFER_SIZE = 1024 * 1024;
  
  /**
   * Indication that the number of samples has not been set by the configuration.
   */
  private static final int NUM_SAMPLES_UNDEFINED = -1;
  
  /**
   * The maximum number of line samples to be taken. Assigned by {@link #loadGloablConfigParams()}.
   */
  private static int DEFAULT_MAX_NUM_SAMPLES;
  
  /**
   * The minimum number of line samples to be taken. Assigned by {@link #loadGloablConfigParams()} and
   * never larger than the maximum.
   */
  private static int DEFAULT_MIN_NUM_SAMPLES;
  
  /**
   * The maximum size of a sample record before sampling is aborted. To catch cases where a wrong delimiter is given.
   * Assigned by {@link #loadGloablConfigParams()}.
   */
  private static int MAX_SAMPLE_LEN;
  
  // load the sampling defaults from the global configuration when the class is initialized
  static { loadGloablConfigParams(); }
  
  protected static final void loadGloablConfigParams() {
    int maxSamples = GlobalConfiguration.getInteger(ConfigConstants.DELIMITED_FORMAT_MAX_LINE_SAMPLES_KEY,
        ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_LINE_SAMPLES);
    int minSamples = GlobalConfiguration.getInteger(ConfigConstants.DELIMITED_FORMAT_MIN_LINE_SAMPLES_KEY,
      ConfigConstants.DEFAULT_DELIMITED_FORMAT_MIN_LINE_SAMPLES);
    
    if (maxSamples < 0) {
      LOG.error("Invalid default maximum number of line samples: " + maxSamples + ". Using default value of " +
        ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_LINE_SAMPLES);
      maxSamples = ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_LINE_SAMPLES;
    }
    if (minSamples < 0) {
      LOG.error("Invalid default minimum number of line samples: " + minSamples + ". Using default value of " +
        ConfigConstants.DEFAULT_DELIMITED_FORMAT_MIN_LINE_SAMPLES);
      minSamples = ConfigConstants.DEFAULT_DELIMITED_FORMAT_MIN_LINE_SAMPLES;
    }
    
    DEFAULT_MAX_NUM_SAMPLES = maxSamples;
    
    if (minSamples > maxSamples) {
      LOG.error("Defaul minimum number of line samples cannot be greater the default maximum number " +
          "of line samples: min=" + minSamples + ", max=" + maxSamples + ". Defaulting minumum to maximum.");
      DEFAULT_MIN_NUM_SAMPLES = maxSamples;
    } else {
      DEFAULT_MIN_NUM_SAMPLES = minSamples;
    }
    
    int maxLen = GlobalConfiguration.getInteger(ConfigConstants.DELIMITED_FORMAT_MAX_SAMPLE_LENGTH_KEY,
        ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_SAMPLE_LEN);
    if (maxLen <= 0) {
      maxLen = ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_SAMPLE_LEN;
      LOG.error("Invalid value for the maximum sample record length. Using defailt value of " + maxLen + '.');
    } else if (maxLen < DEFAULT_READ_BUFFER_SIZE) {
      maxLen = DEFAULT_READ_BUFFER_SIZE;
      LOG.warn("Increasing maximum sample record length to size of the read buffer (" + maxLen + ").");
    }
    MAX_SAMPLE_LEN = maxLen;
  }
  
  // --------------------------------------------------------------------------------------------
  //  Variables for internal parsing.
  //  They are all transient, because we do not want them to be serialized 
  // --------------------------------------------------------------------------------------------
  
  // buffer that receives the bytes read from the file stream
  private transient byte[] readBuffer;


  // buffer in which a record is assembled when it is not contained in a single fill of the read buffer
  private transient byte[] wrapBuffer;


  // index of the next byte to examine in the read buffer
  private transient int readPos;


  // number of valid bytes in the read buffer, as returned by the last read from the stream
  private transient int limit;
  
  private transient byte[] currBuffer;    // buffer in which current record byte sequence is found
  private transient int currOffset;      // offset in above buffer
  private transient int currLen;        // length of current byte sequence


  // set when the buffer is refilled although no bytes of the split length remain; no further record is
  // started once this flag is set
  private transient boolean overLimit;


  // value reported by reachedEnd()
  private transient boolean end;
  
  
  // --------------------------------------------------------------------------------------------
  //  The configuration parameters. Configured on the instance and serialized to be shipped.
  // --------------------------------------------------------------------------------------------
  
  // byte sequence that separates records; defaults to the newline character
  private byte[] delimiter = new byte[] {'\n'};
  
  // maximum number of bytes of a record that spans buffer refills before reading fails with an IOException
  private int lineLengthLimit = Integer.MAX_VALUE;
  
  // size of the read buffer; a non-positive value is replaced by the default size when a split is opened
  private int bufferSize = -1;
  
  // number of line samples for the statistics; NUM_SAMPLES_UNDEFINED lets getStatistics() derive it from the input size
  private int numLineSamples = NUM_SAMPLES_UNDEFINED;
  
  
  // --------------------------------------------------------------------------------------------
  //  Constructors & Getters/setters for the configurable parameters
  // --------------------------------------------------------------------------------------------
  
  /**
   * Creates a delimited input format without a file path, using the newline delimiter.
   */
  public DelimitedInputFormat() {
    super();
  }
  
  /**
   * Creates a delimited input format and passes the given path on to the parent file input format.
   * 
   * @param filePath The path handed to the super class constructor.
   */
  protected DelimitedInputFormat(Path filePath) {
    super(filePath);
  }
  
  
  /**
   * Gets the byte sequence that separates records.
   * 
   * @return The delimiter bytes. This is the internal array, not a copy.
   */
  public byte[] getDelimiter() {
    return delimiter;
  }
  
  public void setDelimiter(byte[] delimiter) {
    if (delimiter == null) {
      throw new IllegalArgumentException("Delimiter must not be null");
    }
    
    this.delimiter = delimiter;
  }
  
  /**
   * Sets the record delimiter to the given character, by delegating to {@link #setDelimiter(String)}.
   * 
   * @param delimiter The delimiter character.
   */
  public void setDelimiter(char delimiter) {
    setDelimiter(String.valueOf(delimiter));
  }
  
  /**
   * Sets the record delimiter to the UTF-8 encoded bytes of the given string.
   * 
   * @param delimiter The delimiter string, must not be null.
   */
  public void setDelimiter(String delimiter) {
    setDelimiter(delimiter, Charsets.UTF_8);
  }
  
  public void setDelimiter(String delimiter, String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
    if (charsetName == null) {
      throw new IllegalArgumentException("Charset name must not be null");
    }
    
    Charset charset = Charset.forName(charsetName);
    setDelimiter(delimiter, charset);
  }
  
  public void setDelimiter(String delimiter, Charset charset) {
    if (delimiter == null) {
      throw new IllegalArgumentException("Delimiter must not be null");
    }
    if (charset == null) {
      throw new IllegalArgumentException("Charset must not be null");
    }
    
    this.delimiter = delimiter.getBytes(charset);
  }
  
  /**
   * Gets the configured maximum record length.
   * 
   * @return The line length limit, {@code Integer.MAX_VALUE} by default.
   */
  public int getLineLengthLimit() {
    return lineLengthLimit;
  }
  
  public void setLineLengthLimit(int lineLengthLimit) {
    if (lineLengthLimit < 1) {
      throw new IllegalArgumentException("Line length limit must be at least 1.");
    }


    this.lineLengthLimit = lineLengthLimit;
  }
  
  /**
   * Gets the size of the read buffer.
   * 
   * @return The buffer size; {@code -1} as long as no size has been set and no split has been opened.
   */
  public int getBufferSize() {
    return bufferSize;
  }
  
  public void setBufferSize(int bufferSize) {
    if (bufferSize < 1) {
      throw new IllegalArgumentException("Buffer size must be at least 1.");
    }
    
    this.bufferSize = bufferSize;
  }
  
  /**
   * Gets the number of line samples taken for the statistics.
   * 
   * @return The number of samples, or {@code -1} if it has not been set.
   */
  public int getNumLineSamples() {
    return numLineSamples;
  }
  
  public void setNumLineSamples(int numLineSamples) {
    if (numLineSamples < 0) {
      throw new IllegalArgumentException("Number of line samples must not be negative.");
    }
    
    this.numLineSamples = numLineSamples;
  }
  
  // --------------------------------------------------------------------------------------------
  //  User-defined behavior
  // --------------------------------------------------------------------------------------------


  /**
   * Parses the bytes of a single record into an object of the produced type.
   * <p>
   * {@link #nextRecord(Object)} calls this method for each byte sequence found between two delimiters and
   * hands the result back to its caller unchanged. The byte array is one of the internal buffers of this
   * format and is overwritten by subsequent reads, so implementations must copy any bytes they want to keep.
   * 
   * @param reuse An optionally reusable object.
   * @param bytes The buffer that contains the bytes of the record.
   * @param offset The offset where to start to read the record data. 
   * @param numBytes The number of bytes that can be read starting at the offset position.
   * 
   * @return The parsed record. NOTE(review): the previous documentation described a boolean result for
   *         skipping invalid records, which does not match this signature; whether callers treat a
   *         {@code null} result as "skip" is not visible here and should be confirmed.
   */
  public abstract OT readRecord(OT reuse, byte[] bytes, int offset, int numBytes);
  
  // --------------------------------------------------------------------------------------------
  //  Pre-flight: Configuration, Splits, Sampling
  // --------------------------------------------------------------------------------------------
  
  /**
   * Configures this input format by reading the path to the file from the configuration andge the string that
   * defines the record delimiter.
   * 
   * @param parameters The configuration object to read the parameters from.
   */
  @Override
  public void configure(Configuration parameters) {
    super.configure(parameters);
    
    String delimString = parameters.getString(RECORD_DELIMITER, null);
    if (delimString != null) {
      String charsetName = parameters.getString(RECORD_DELIMITER_ENCODING, null);


      if (charsetName == null) {
        setDelimiter(delimString);
      } else {
        try {
          setDelimiter(delimString, charsetName);
        }
        catch (UnsupportedCharsetException e) {
          throw new IllegalArgumentException("The charset with the name '" + charsetName + 
              "' is not supported on this TaskManager instance.", e);
        }
      }
    }
    
    // set the number of samples
    String samplesString = parameters.getString(NUM_STATISTICS_SAMPLES, null);
    if (samplesString != null) {
      try {
        setNumLineSamples(Integer.parseInt(samplesString));
      }
      catch (NumberFormatException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Invalid value for number of samples to take: " + samplesString + ". Skipping sampling.");
        }
        setNumLineSamples(0);
      }
    }
  }
  
  @Override
  public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
    
    final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ?
        (FileBaseStatistics) cachedStats : null;
    
    // store properties
    final long oldTimeout = this.openTimeout;
    final int oldBufferSize = this.bufferSize;
    final int oldLineLengthLimit = this.lineLengthLimit;
    try {
      final Path filePath = this.filePath;
    
      // get the filesystem
      final FileSystem fs = FileSystem.get(filePath.toUri());
      final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
      
      // let the file input format deal with the up-to-date check and the basic size
      final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
      if (stats == null) {
        return null;
      }
      
      // check whether the width per record is already known or the total size is unknown as well
      // in both cases, we return the stats as they are
      if (stats.getAverageRecordWidth() != FileBaseStatistics.AVG_RECORD_BYTES_UNKNOWN ||
          stats.getTotalInputSize() == FileBaseStatistics.SIZE_UNKNOWN) {
        return stats;
      }
      
      // disabling sampling for unsplittable files since the logic below assumes splitability.
      // TODO: Add sampling for unsplittable files. Right now, only compressed text files are affected by this limitation.
      if(unsplittable) {
        return stats;
      }
      
      // compute how many samples to take, depending on the defined upper and lower bound
      final int numSamples;
      if (this.numLineSamples != NUM_SAMPLES_UNDEFINED) {
        numSamples = this.numLineSamples;
      } else {
        // make the samples small for very small files
        final int calcSamples = (int) (stats.getTotalInputSize() / 1024);
        numSamples = Math.min(DEFAULT_MAX_NUM_SAMPLES, Math.max(DEFAULT_MIN_NUM_SAMPLES, calcSamples));
      }
      
      // check if sampling is disabled.
      if (numSamples == 0) {
        return stats;
      }
      if (numSamples < 0) {
        throw new RuntimeException("Error: Invalid number of samples: " + numSamples);
      }
      
      
      // make sure that the sampling times out after a while if the file system does not answer in time
      this.openTimeout = 10000;
      // set a small read buffer size
      this.bufferSize = 4 * 1024;
      // prevent overly large records, for example if we have an incorrectly configured delimiter
      this.lineLengthLimit = MAX_SAMPLE_LEN;
      
      long offset = 0;
      long totalNumBytes = 0;
      long stepSize = stats.getTotalInputSize() / numSamples;


      int fileNum = 0;
      int samplesTaken = 0;


      // take the samples
      while (samplesTaken < numSamples && fileNum < allFiles.size()) {
        // make a split for the sample and use it to read a record
        FileStatus file = allFiles.get(fileNum);
        FileInputSplit split = new FileInputSplit(0, file.getPath(), offset, file.getLen() - offset, null);


        // we open the split, read one line, and take its length
        try {
          open(split);
          if (readLine()) {
            totalNumBytes += this.currLen + this.delimiter.length;
            samplesTaken++;
          }
        } finally {
          // close the file stream, do not release the buffers
          super.close();
        }


        offset += stepSize;


        // skip to the next file, if necessary
        while (fileNum < allFiles.size() && offset >= (file = allFiles.get(fileNum)).getLen()) {
          offset -= file.getLen();
          fileNum++;
        }
      }
      
      // we have the width, store it
      return new FileBaseStatistics(stats.getLastModificationTime(),
        stats.getTotalInputSize(), totalNumBytes / (float) samplesTaken);
      
    } catch (IOException ioex) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Could not determine statistics for file '" + this.filePath + "' due to an io error: "
            + ioex.getMessage());
      }
    }
    catch (Throwable t) {
      if (LOG.isErrorEnabled()) {
        LOG.error("Unexpected problen while getting the file statistics for file '" + this.filePath + "': "
            + t.getMessage(), t);
      }
    } finally {
      // restore properties (even on return)
      this.openTimeout = oldTimeout;
      this.bufferSize = oldBufferSize;
      this.lineLengthLimit = oldLineLengthLimit;
    }
    
    // no statistics possible
    return null;
  }


  /**
   * Opens the given input split. This method opens the input stream to the specified file, allocates read buffers
   * and positions the stream at the correct position, making sure that any partial record at the beginning is skipped.
   * 
   * @param split The input split to open.
   * 
   * @see eu.stratosphere.api.common.io.FileInputFormat#open(eu.stratosphere.core.fs.FileInputSplit)
   */
  @Override
  public void open(FileInputSplit split) throws IOException {
    super.open(split);
    
    this.bufferSize = this.bufferSize <= 0 ? DEFAULT_READ_BUFFER_SIZE : this.bufferSize;
    
    if (this.readBuffer == null || this.readBuffer.length != this.bufferSize) {
      this.readBuffer = new byte[this.bufferSize];
    }
    if (this.wrapBuffer == null || this.wrapBuffer.length < 256) {
      this.wrapBuffer = new byte[256];
    }


    this.readPos = 0;
    this.limit = 0;
    this.overLimit = false;
    this.end = false;


    if (this.splitStart != 0) {
      this.stream.seek(this.splitStart);
      readLine();
      
      // if the first partial record already pushes the stream over the limit of our split, then no
      // record starts within this split 
      if (this.overLimit) {
        this.end = true;
      }
    } else {
      fillBuffer();
    }
  }


  /**
   * Checks whether the current split is at its end. The end is reached once a read attempt found no
   * further record, or when opening the split showed that no record starts within it.
   * 
   * @return True, if the split is at its end, false otherwise.
   */
  @Override
  public boolean reachedEnd() {
    return this.end;
  }
  
  @Override
  public OT nextRecord(OT record) throws IOException {
    if (readLine()) {
      return readRecord(record, this.currBuffer, this.currOffset, this.currLen);
    } else {
      this.end = true;
      return null;
    }
  }


  /**
   * Closes the input by releasing all buffers and closing the file input stream.
   * 
   * @throws IOException Thrown, if the closing of the file stream causes an I/O error.
   */
  @Override
  public void close() throws IOException {
    // drop the buffer references first; they are allocated again when the next split is opened
    this.wrapBuffer = null;
    this.readBuffer = null;
    super.close();
  }


  // --------------------------------------------------------------------------------------------


  protected final boolean readLine() throws IOException {
    if (this.stream == null || this.overLimit) {
      return false;
    }


    int countInWrapBuffer = 0;


    /* position of matching positions in the delimiter byte array */
    int i = 0;


    while (true) {
      if (this.readPos >= this.limit) {
        if (!fillBuffer()) {
          if (countInWrapBuffer > 0) {
            setResult(this.wrapBuffer, 0, countInWrapBuffer);
            return true;
          } else {
            return false;
          }
        }
      }


      int startPos = this.readPos;
      int count = 0;


      while (this.readPos < this.limit && i < this.delimiter.length) {
        if ((this.readBuffer[this.readPos++]) == this.delimiter[i]) {
          i++;
        } else {
          i = 0;
        }


      }


      // check why we dropped out
      if (i == this.delimiter.length) {
        // line end
        count = this.readPos - startPos - this.delimiter.length;


        // copy to byte array
        if (countInWrapBuffer > 0) {
          // check wrap buffer size
          if (this.wrapBuffer.length < countInWrapBuffer + count) {
            final byte[] nb = new byte[countInWrapBuffer + count];
            System.arraycopy(this.wrapBuffer, 0, nb, 0, countInWrapBuffer);
            this.wrapBuffer = nb;
          }
          if (count >= 0) {
            System.arraycopy(this.readBuffer, 0, this.wrapBuffer, countInWrapBuffer, count);
          }
          setResult(this.wrapBuffer, 0, countInWrapBuffer + count);
          return true;
        } else {
          setResult(this.readBuffer, startPos, count);
          return true;
        }
      } else {
        count = this.limit - startPos;
        
        // check against the maximum record length
        if ( ((long) countInWrapBuffer) + count > this.lineLengthLimit) {
          throw new IOException("The record length exceeded the maximum record length (" + 
              this.lineLengthLimit + ").");
        }


        // buffer exhausted
        if (this.wrapBuffer.length - countInWrapBuffer < count) {
          // reallocate
          byte[] tmp = new byte[Math.max(this.wrapBuffer.length * 2, countInWrapBuffer + count)];
          System.arraycopy(this.wrapBuffer, 0, tmp, 0, countInWrapBuffer);
          this.wrapBuffer = tmp;
        }


        System.arraycopy(this.readBuffer, startPos, this.wrapBuffer, countInWrapBuffer, count);
        countInWrapBuffer += count;
      }
    }
  }
  
  /**
   * Stores the location of the bytes of the current record.
   * 
   * @param buffer The buffer that contains the record bytes.
   * @param offset The position of the first record byte in the buffer.
   * @param len The number of record bytes.
   */
  private final void setResult(byte[] buffer, int offset, int len) {
    this.currBuffer = buffer;
    this.currOffset = offset;
    this.currLen = len;
  }


  private final boolean fillBuffer() throws IOException {
    // special case for reading the whole split.
    if(this.splitLength == FileInputFormat.READ_WHOLE_SPLIT_FLAG) {
      int read = this.stream.read(this.readBuffer, 0, readBuffer.length);
      if (read == -1) {
        this.stream.close();
        this.stream = null;
        return false;
      } else {
        this.readPos = 0;
        this.limit = read;
        return true;
      }
    }
    // else ..
    int toRead = this.splitLength > this.readBuffer.length ? this.readBuffer.length : (int) this.splitLength;
    if (this.splitLength <= 0) {
      toRead = this.readBuffer.length;
      this.overLimit = true;
    }


    int read = this.stream.read(this.readBuffer, 0, toRead);


    if (read == -1) {
      this.stream.close();
      this.stream = null;
      return false;
    } else {
      this.splitLength -= read;
      this.readPos = 0;
      this.limit = read;
      return true;
    }
  }
  
  // ============================================================================================
  //  Parameterization via configuration
  // ============================================================================================
  
  // ------------------------------------- Config Keys ------------------------------------------
  
  /**
   * The configuration key to set the record delimiter. The value is the delimiter as a string.
   */
  protected static final String RECORD_DELIMITER = "delimited-format.delimiter";
  
  /**
   * The configuration key to set the record delimiter encoding. The value is the name of the charset
   * used to convert the delimiter string to bytes; without it, the delimiter is encoded as UTF-8.
   */
  private static final String RECORD_DELIMITER_ENCODING = "delimited-format.delimiter-encoding";
  
  /**
   * The configuration key to set the number of samples to take for the statistics. A value that cannot
   * be parsed as an integer disables the sampling.
   */
  private static final String NUM_STATISTICS_SAMPLES = "delimited-format.numSamples";
  
  // ----------------------------------- Config Builder -----------------------------------------
  
  /**
   * Creates a configuration builder that can be used to set the input format's parameters to the config in a fluent
   * fashion.
   * 
   * @param target The data source whose parameters the builder writes to.
   * @return A config builder for setting parameters.
   */
  public static ConfigBuilder configureDelimitedFormat(FileDataSourceBase<?> target) {
    return new ConfigBuilder(target.getParameters());
  }
  
  /**
   * Abstract builder used to set parameters to the input format's configuration in a fluent way.
   */
  protected static class AbstractConfigBuilder<T> extends FileInputFormat.AbstractConfigBuilder<T> {
    
    private static final String NEWLINE_DELIMITER = "\n";
    
    // --------------------------------------------------------------------
    
    /**
     * Creates a new builder for the given configuration.
     * 
     * @param config The configuration into which the parameters will be written.
     */
    protected AbstractConfigBuilder(Configuration config) {
      super(config);
    }
    
    // --------------------------------------------------------------------
    
    /**
     * Sets the delimiter to be a single character, namely the given one. The character must be within
     * the value range <code>0</code> to <code>127</code>.
     * 
     * @param delimiter The delimiter character.
     * @return The builder itself.
     */
    public T recordDelimiter(char delimiter) {
      if (delimiter == '\n') {
        this.config.setString(RECORD_DELIMITER, NEWLINE_DELIMITER);
      } else {
        this.config.setString(RECORD_DELIMITER, String.valueOf(delimiter));
      }
      @SuppressWarnings("unchecked")
      T ret = (T) this;
      return ret;
    }
    
    /**
     * Sets the delimiter to be the given string. The string will be converted to bytes for more efficient
     * comparison during input parsing. The conversion will be done using the platforms default charset.
     * 
     * @param delimiter The delimiter string.
     * @return The builder itself.
     */
    public T recordDelimiter(String delimiter) {
      this.config.setString(RECORD_DELIMITER, delimiter);
      @SuppressWarnings("unchecked")
      T ret = (T) this;
      return ret;
    }
    
    /**
     * Sets the delimiter to be the given string. The string will be converted to bytes for more efficient
     * comparison during input parsing. The conversion will be done using the charset with the given name.
     * The charset must be available on the processing nodes, otherwise an exception will be raised at
     * runtime.
     * 
     * @param delimiter The delimiter string.
     * @param charsetName The name of the encoding character set.
     * @return The builder itself.
     */
    public T recordDelimiter(String delimiter, String charsetName) {
      this.config.setString(RECORD_DELIMITER, delimiter);
      this.config.setString(RECORD_DELIMITER_ENCODING, charsetName);
      @SuppressWarnings("unchecked")
      T ret = (T) this;
      return ret;
    }
    
    /**
     * Sets the number of line samples to take in order to estimate the base statistics for the
     * input format.
     * 
     * @param numSamples The number of line samples to take.
     * @return The builder itself.
     */
    public T numSamplesForStatistics(int numSamples) {
      this.config.setInteger(NUM_STATISTICS_SAMPLES, numSamples);
      @SuppressWarnings("unchecked")
      T ret = (T) this;
      return ret;
    }
  }
  
  /**
   * A builder used to set parameters to the input format's configuration in a fluent way. It adds no
   * setters of its own; it fixes the builder type returned by the inherited setters.
   */
  public static class ConfigBuilder extends AbstractConfigBuilder<ConfigBuilder> {
    
    /**
     * Creates a new builder for the given configuration.
     * 
     * @param targetConfig The configuration into which the parameters will be written.
     */
    protected ConfigBuilder(Configuration targetConfig) {
      super(targetConfig);
    }
  }
}
Source Code of eu.stratosphere.api.common.io.DelimitedInputFormat$ConfigBuilder

Related Classes of eu.stratosphere.api.common.io.DelimitedInputFormat$ConfigBuilder