/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.api.common.io;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.base.Charsets;
import eu.stratosphere.api.common.io.statistics.BaseStatistics;
import eu.stratosphere.api.common.operators.base.FileDataSourceBase;
import eu.stratosphere.configuration.ConfigConstants;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.configuration.GlobalConfiguration;
import eu.stratosphere.core.fs.FileInputSplit;
import eu.stratosphere.core.fs.FileStatus;
import eu.stratosphere.core.fs.FileSystem;
import eu.stratosphere.core.fs.Path;
/**
* Base implementation for input formats that split the input at a delimiter into records.
* The parsing of the record bytes into the record has to be implemented in the
* {@link #readRecord(Object, byte[], int, int)} method.
* <p>
* The default delimiter is the newline character {@code '\n'}.
*/
public abstract class DelimitedInputFormat<OT> extends FileInputFormat<OT> {
private static final long serialVersionUID = 1L;
// -------------------------------------- Constants -------------------------------------------
/**
* The log.
*/
private static final Log LOG = LogFactory.getLog(DelimitedInputFormat.class);
/**
* The default read buffer size = 1MB.
*/
private static final int DEFAULT_READ_BUFFER_SIZE = 1024 * 1024;
/**
* Indication that the number of samples has not been set by the configuration.
*/
private static final int NUM_SAMPLES_UNDEFINED = -1;
/**
* The maximum number of line samples to be taken.
*/
private static int DEFAULT_MAX_NUM_SAMPLES;
/**
* The minimum number of line samples to be taken.
*/
private static int DEFAULT_MIN_NUM_SAMPLES;
/**
* The maximum size of a sample record before sampling is aborted. To catch cases where a wrong delimiter is given.
*/
private static int MAX_SAMPLE_LEN;
static { loadGloablConfigParams(); }
protected static final void loadGloablConfigParams() {
int maxSamples = GlobalConfiguration.getInteger(ConfigConstants.DELIMITED_FORMAT_MAX_LINE_SAMPLES_KEY,
ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_LINE_SAMPLES);
int minSamples = GlobalConfiguration.getInteger(ConfigConstants.DELIMITED_FORMAT_MIN_LINE_SAMPLES_KEY,
ConfigConstants.DEFAULT_DELIMITED_FORMAT_MIN_LINE_SAMPLES);
if (maxSamples < 0) {
LOG.error("Invalid default maximum number of line samples: " + maxSamples + ". Using default value of " +
ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_LINE_SAMPLES);
maxSamples = ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_LINE_SAMPLES;
}
if (minSamples < 0) {
LOG.error("Invalid default minimum number of line samples: " + minSamples + ". Using default value of " +
ConfigConstants.DEFAULT_DELIMITED_FORMAT_MIN_LINE_SAMPLES);
minSamples = ConfigConstants.DEFAULT_DELIMITED_FORMAT_MIN_LINE_SAMPLES;
}
DEFAULT_MAX_NUM_SAMPLES = maxSamples;
if (minSamples > maxSamples) {
LOG.error("Defaul minimum number of line samples cannot be greater the default maximum number " +
"of line samples: min=" + minSamples + ", max=" + maxSamples + ". Defaulting minumum to maximum.");
DEFAULT_MIN_NUM_SAMPLES = maxSamples;
} else {
DEFAULT_MIN_NUM_SAMPLES = minSamples;
}
int maxLen = GlobalConfiguration.getInteger(ConfigConstants.DELIMITED_FORMAT_MAX_SAMPLE_LENGTH_KEY,
ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_SAMPLE_LEN);
if (maxLen <= 0) {
maxLen = ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_SAMPLE_LEN;
LOG.error("Invalid value for the maximum sample record length. Using defailt value of " + maxLen + '.');
} else if (maxLen < DEFAULT_READ_BUFFER_SIZE) {
maxLen = DEFAULT_READ_BUFFER_SIZE;
LOG.warn("Increasing maximum sample record length to size of the read buffer (" + maxLen + ").");
}
MAX_SAMPLE_LEN = maxLen;
}
// --------------------------------------------------------------------------------------------
// Variables for internal parsing.
// They are all transient, because we do not want them so be serialized
// --------------------------------------------------------------------------------------------
private transient byte[] readBuffer;
private transient byte[] wrapBuffer;
private transient int readPos;
private transient int limit;
private transient byte[] currBuffer; // buffer in which current record byte sequence is found
private transient int currOffset; // offset in above buffer
private transient int currLen; // length of current byte sequence
private transient boolean overLimit;
private transient boolean end;
// --------------------------------------------------------------------------------------------
// The configuration parameters. Configured on the instance and serialized to be shipped.
// --------------------------------------------------------------------------------------------
private byte[] delimiter = new byte[] {'\n'};
private int lineLengthLimit = Integer.MAX_VALUE;
private int bufferSize = -1;
private int numLineSamples = NUM_SAMPLES_UNDEFINED;
// --------------------------------------------------------------------------------------------
// Constructors & Getters/setters for the configurable parameters
// --------------------------------------------------------------------------------------------
public DelimitedInputFormat() {
super();
}
protected DelimitedInputFormat(Path filePath) {
super(filePath);
}
public byte[] getDelimiter() {
return delimiter;
}
public void setDelimiter(byte[] delimiter) {
if (delimiter == null) {
throw new IllegalArgumentException("Delimiter must not be null");
}
this.delimiter = delimiter;
}
public void setDelimiter(char delimiter) {
setDelimiter(String.valueOf(delimiter));
}
public void setDelimiter(String delimiter) {
setDelimiter(delimiter, Charsets.UTF_8);
}
public void setDelimiter(String delimiter, String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
if (charsetName == null) {
throw new IllegalArgumentException("Charset name must not be null");
}
Charset charset = Charset.forName(charsetName);
setDelimiter(delimiter, charset);
}
public void setDelimiter(String delimiter, Charset charset) {
if (delimiter == null) {
throw new IllegalArgumentException("Delimiter must not be null");
}
if (charset == null) {
throw new IllegalArgumentException("Charset must not be null");
}
this.delimiter = delimiter.getBytes(charset);
}
public int getLineLengthLimit() {
return lineLengthLimit;
}
public void setLineLengthLimit(int lineLengthLimit) {
if (lineLengthLimit < 1) {
throw new IllegalArgumentException("Line length limit must be at least 1.");
}
this.lineLengthLimit = lineLengthLimit;
}
public int getBufferSize() {
return bufferSize;
}
public void setBufferSize(int bufferSize) {
if (bufferSize < 1) {
throw new IllegalArgumentException("Buffer size must be at least 1.");
}
this.bufferSize = bufferSize;
}
public int getNumLineSamples() {
return numLineSamples;
}
public void setNumLineSamples(int numLineSamples) {
if (numLineSamples < 0) {
throw new IllegalArgumentException("Number of line samples must not be negative.");
}
this.numLineSamples = numLineSamples;
}
// --------------------------------------------------------------------------------------------
// User-defined behavior
// --------------------------------------------------------------------------------------------
/**
* This function parses the given byte array which represents a serialized records.
* The parsed content is then returned by setting the pair variables. If the
* byte array contains invalid content the record can be skipped by returning <tt>false</tt>.
*
* @param reuse An optionally reusable object.
* @param bytes Binary data of serialized records.
* @param offset The offset where to start to read the record data.
* @param numBytes The number of bytes that can be read starting at the offset position.
*
* @return returns whether the record was successfully deserialized or not.
*/
public abstract OT readRecord(OT reuse, byte[] bytes, int offset, int numBytes);
// --------------------------------------------------------------------------------------------
// Pre-flight: Configuration, Splits, Sampling
// --------------------------------------------------------------------------------------------
/**
* Configures this input format by reading the path to the file from the configuration andge the string that
* defines the record delimiter.
*
* @param parameters The configuration object to read the parameters from.
*/
@Override
public void configure(Configuration parameters) {
super.configure(parameters);
String delimString = parameters.getString(RECORD_DELIMITER, null);
if (delimString != null) {
String charsetName = parameters.getString(RECORD_DELIMITER_ENCODING, null);
if (charsetName == null) {
setDelimiter(delimString);
} else {
try {
setDelimiter(delimString, charsetName);
}
catch (UnsupportedCharsetException e) {
throw new IllegalArgumentException("The charset with the name '" + charsetName +
"' is not supported on this TaskManager instance.", e);
}
}
}
// set the number of samples
String samplesString = parameters.getString(NUM_STATISTICS_SAMPLES, null);
if (samplesString != null) {
try {
setNumLineSamples(Integer.parseInt(samplesString));
}
catch (NumberFormatException e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Invalid value for number of samples to take: " + samplesString + ". Skipping sampling.");
}
setNumLineSamples(0);
}
}
}
@Override
public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ?
(FileBaseStatistics) cachedStats : null;
// store properties
final long oldTimeout = this.openTimeout;
final int oldBufferSize = this.bufferSize;
final int oldLineLengthLimit = this.lineLengthLimit;
try {
final Path filePath = this.filePath;
// get the filesystem
final FileSystem fs = FileSystem.get(filePath.toUri());
final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
// let the file input format deal with the up-to-date check and the basic size
final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
if (stats == null) {
return null;
}
// check whether the width per record is already known or the total size is unknown as well
// in both cases, we return the stats as they are
if (stats.getAverageRecordWidth() != FileBaseStatistics.AVG_RECORD_BYTES_UNKNOWN ||
stats.getTotalInputSize() == FileBaseStatistics.SIZE_UNKNOWN) {
return stats;
}
// disabling sampling for unsplittable files since the logic below assumes splitability.
// TODO: Add sampling for unsplittable files. Right now, only compressed text files are affected by this limitation.
if(unsplittable) {
return stats;
}
// compute how many samples to take, depending on the defined upper and lower bound
final int numSamples;
if (this.numLineSamples != NUM_SAMPLES_UNDEFINED) {
numSamples = this.numLineSamples;
} else {
// make the samples small for very small files
final int calcSamples = (int) (stats.getTotalInputSize() / 1024);
numSamples = Math.min(DEFAULT_MAX_NUM_SAMPLES, Math.max(DEFAULT_MIN_NUM_SAMPLES, calcSamples));
}
// check if sampling is disabled.
if (numSamples == 0) {
return stats;
}
if (numSamples < 0) {
throw new RuntimeException("Error: Invalid number of samples: " + numSamples);
}
// make sure that the sampling times out after a while if the file system does not answer in time
this.openTimeout = 10000;
// set a small read buffer size
this.bufferSize = 4 * 1024;
// prevent overly large records, for example if we have an incorrectly configured delimiter
this.lineLengthLimit = MAX_SAMPLE_LEN;
long offset = 0;
long totalNumBytes = 0;
long stepSize = stats.getTotalInputSize() / numSamples;
int fileNum = 0;
int samplesTaken = 0;
// take the samples
while (samplesTaken < numSamples && fileNum < allFiles.size()) {
// make a split for the sample and use it to read a record
FileStatus file = allFiles.get(fileNum);
FileInputSplit split = new FileInputSplit(0, file.getPath(), offset, file.getLen() - offset, null);
// we open the split, read one line, and take its length
try {
open(split);
if (readLine()) {
totalNumBytes += this.currLen + this.delimiter.length;
samplesTaken++;
}
} finally {
// close the file stream, do not release the buffers
super.close();
}
offset += stepSize;
// skip to the next file, if necessary
while (fileNum < allFiles.size() && offset >= (file = allFiles.get(fileNum)).getLen()) {
offset -= file.getLen();
fileNum++;
}
}
// we have the width, store it
return new FileBaseStatistics(stats.getLastModificationTime(),
stats.getTotalInputSize(), totalNumBytes / (float) samplesTaken);
} catch (IOException ioex) {
if (LOG.isWarnEnabled()) {
LOG.warn("Could not determine statistics for file '" + this.filePath + "' due to an io error: "
+ ioex.getMessage());
}
}
catch (Throwable t) {
if (LOG.isErrorEnabled()) {
LOG.error("Unexpected problen while getting the file statistics for file '" + this.filePath + "': "
+ t.getMessage(), t);
}
} finally {
// restore properties (even on return)
this.openTimeout = oldTimeout;
this.bufferSize = oldBufferSize;
this.lineLengthLimit = oldLineLengthLimit;
}
// no statistics possible
return null;
}
/**
* Opens the given input split. This method opens the input stream to the specified file, allocates read buffers
* and positions the stream at the correct position, making sure that any partial record at the beginning is skipped.
*
* @param split The input split to open.
*
* @see eu.stratosphere.api.common.io.FileInputFormat#open(eu.stratosphere.core.fs.FileInputSplit)
*/
@Override
public void open(FileInputSplit split) throws IOException {
super.open(split);
this.bufferSize = this.bufferSize <= 0 ? DEFAULT_READ_BUFFER_SIZE : this.bufferSize;
if (this.readBuffer == null || this.readBuffer.length != this.bufferSize) {
this.readBuffer = new byte[this.bufferSize];
}
if (this.wrapBuffer == null || this.wrapBuffer.length < 256) {
this.wrapBuffer = new byte[256];
}
this.readPos = 0;
this.limit = 0;
this.overLimit = false;
this.end = false;
if (this.splitStart != 0) {
this.stream.seek(this.splitStart);
readLine();
// if the first partial record already pushes the stream over the limit of our split, then no
// record starts within this split
if (this.overLimit) {
this.end = true;
}
} else {
fillBuffer();
}
}
/**
* Checks whether the current split is at its end.
*
* @return True, if the split is at its end, false otherwise.
*/
@Override
public boolean reachedEnd() {
return this.end;
}
@Override
public OT nextRecord(OT record) throws IOException {
if (readLine()) {
return readRecord(record, this.currBuffer, this.currOffset, this.currLen);
} else {
this.end = true;
return null;
}
}
/**
* Closes the input by releasing all buffers and closing the file input stream.
*
* @throws IOException Thrown, if the closing of the file stream causes an I/O error.
*/
@Override
public void close() throws IOException {
this.wrapBuffer = null;
this.readBuffer = null;
super.close();
}
// --------------------------------------------------------------------------------------------
protected final boolean readLine() throws IOException {
if (this.stream == null || this.overLimit) {
return false;
}
int countInWrapBuffer = 0;
/* position of matching positions in the delimiter byte array */
int i = 0;
while (true) {
if (this.readPos >= this.limit) {
if (!fillBuffer()) {
if (countInWrapBuffer > 0) {
setResult(this.wrapBuffer, 0, countInWrapBuffer);
return true;
} else {
return false;
}
}
}
int startPos = this.readPos;
int count = 0;
while (this.readPos < this.limit && i < this.delimiter.length) {
if ((this.readBuffer[this.readPos++]) == this.delimiter[i]) {
i++;
} else {
i = 0;
}
}
// check why we dropped out
if (i == this.delimiter.length) {
// line end
count = this.readPos - startPos - this.delimiter.length;
// copy to byte array
if (countInWrapBuffer > 0) {
// check wrap buffer size
if (this.wrapBuffer.length < countInWrapBuffer + count) {
final byte[] nb = new byte[countInWrapBuffer + count];
System.arraycopy(this.wrapBuffer, 0, nb, 0, countInWrapBuffer);
this.wrapBuffer = nb;
}
if (count >= 0) {
System.arraycopy(this.readBuffer, 0, this.wrapBuffer, countInWrapBuffer, count);
}
setResult(this.wrapBuffer, 0, countInWrapBuffer + count);
return true;
} else {
setResult(this.readBuffer, startPos, count);
return true;
}
} else {
count = this.limit - startPos;
// check against the maximum record length
if ( ((long) countInWrapBuffer) + count > this.lineLengthLimit) {
throw new IOException("The record length exceeded the maximum record length (" +
this.lineLengthLimit + ").");
}
// buffer exhausted
if (this.wrapBuffer.length - countInWrapBuffer < count) {
// reallocate
byte[] tmp = new byte[Math.max(this.wrapBuffer.length * 2, countInWrapBuffer + count)];
System.arraycopy(this.wrapBuffer, 0, tmp, 0, countInWrapBuffer);
this.wrapBuffer = tmp;
}
System.arraycopy(this.readBuffer, startPos, this.wrapBuffer, countInWrapBuffer, count);
countInWrapBuffer += count;
}
}
}
private final void setResult(byte[] buffer, int offset, int len) {
this.currBuffer = buffer;
this.currOffset = offset;
this.currLen = len;
}
private final boolean fillBuffer() throws IOException {
// special case for reading the whole split.
if(this.splitLength == FileInputFormat.READ_WHOLE_SPLIT_FLAG) {
int read = this.stream.read(this.readBuffer, 0, readBuffer.length);
if (read == -1) {
this.stream.close();
this.stream = null;
return false;
} else {
this.readPos = 0;
this.limit = read;
return true;
}
}
// else ..
int toRead = this.splitLength > this.readBuffer.length ? this.readBuffer.length : (int) this.splitLength;
if (this.splitLength <= 0) {
toRead = this.readBuffer.length;
this.overLimit = true;
}
int read = this.stream.read(this.readBuffer, 0, toRead);
if (read == -1) {
this.stream.close();
this.stream = null;
return false;
} else {
this.splitLength -= read;
this.readPos = 0;
this.limit = read;
return true;
}
}
// ============================================================================================
// Parameterization via configuration
// ============================================================================================
// ------------------------------------- Config Keys ------------------------------------------
/**
* The configuration key to set the record delimiter.
*/
protected static final String RECORD_DELIMITER = "delimited-format.delimiter";
/**
* The configuration key to set the record delimiter encoding.
*/
private static final String RECORD_DELIMITER_ENCODING = "delimited-format.delimiter-encoding";
/**
* The configuration key to set the number of samples to take for the statistics.
*/
private static final String NUM_STATISTICS_SAMPLES = "delimited-format.numSamples";
// ----------------------------------- Config Builder -----------------------------------------
/**
* Creates a configuration builder that can be used to set the input format's parameters to the config in a fluent
* fashion.
*
* @return A config builder for setting parameters.
*/
public static ConfigBuilder configureDelimitedFormat(FileDataSourceBase<?> target) {
return new ConfigBuilder(target.getParameters());
}
/**
* Abstract builder used to set parameters to the input format's configuration in a fluent way.
*/
protected static class AbstractConfigBuilder<T> extends FileInputFormat.AbstractConfigBuilder<T> {
private static final String NEWLINE_DELIMITER = "\n";
// --------------------------------------------------------------------
/**
* Creates a new builder for the given configuration.
*
* @param config The configuration into which the parameters will be written.
*/
protected AbstractConfigBuilder(Configuration config) {
super(config);
}
// --------------------------------------------------------------------
/**
* Sets the delimiter to be a single character, namely the given one. The character must be within
* the value range <code>0</code> to <code>127</code>.
*
* @param delimiter The delimiter character.
* @return The builder itself.
*/
public T recordDelimiter(char delimiter) {
if (delimiter == '\n') {
this.config.setString(RECORD_DELIMITER, NEWLINE_DELIMITER);
} else {
this.config.setString(RECORD_DELIMITER, String.valueOf(delimiter));
}
@SuppressWarnings("unchecked")
T ret = (T) this;
return ret;
}
/**
* Sets the delimiter to be the given string. The string will be converted to bytes for more efficient
* comparison during input parsing. The conversion will be done using the platforms default charset.
*
* @param delimiter The delimiter string.
* @return The builder itself.
*/
public T recordDelimiter(String delimiter) {
this.config.setString(RECORD_DELIMITER, delimiter);
@SuppressWarnings("unchecked")
T ret = (T) this;
return ret;
}
/**
* Sets the delimiter to be the given string. The string will be converted to bytes for more efficient
* comparison during input parsing. The conversion will be done using the charset with the given name.
* The charset must be available on the processing nodes, otherwise an exception will be raised at
* runtime.
*
* @param delimiter The delimiter string.
* @param charsetName The name of the encoding character set.
* @return The builder itself.
*/
public T recordDelimiter(String delimiter, String charsetName) {
this.config.setString(RECORD_DELIMITER, delimiter);
this.config.setString(RECORD_DELIMITER_ENCODING, charsetName);
@SuppressWarnings("unchecked")
T ret = (T) this;
return ret;
}
/**
* Sets the number of line samples to take in order to estimate the base statistics for the
* input format.
*
* @param numSamples The number of line samples to take.
* @return The builder itself.
*/
public T numSamplesForStatistics(int numSamples) {
this.config.setInteger(NUM_STATISTICS_SAMPLES, numSamples);
@SuppressWarnings("unchecked")
T ret = (T) this;
return ret;
}
}
/**
* A builder used to set parameters to the input format's configuration in a fluent way.
*/
public static class ConfigBuilder extends AbstractConfigBuilder<ConfigBuilder> {
/**
* Creates a new builder for the given configuration.
*
* @param targetConfig The configuration into which the parameters will be written.
*/
protected ConfigBuilder(Configuration targetConfig) {
super(targetConfig);
}
}
}