/**
* Copyright 2011-2014 Asakusa Framework Team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asakusafw.runtime.stage.input;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.ReflectionUtils;
import com.asakusafw.runtime.directio.DirectInputFragment;
import com.asakusafw.runtime.directio.hadoop.BlockInfo;
import com.asakusafw.runtime.directio.hadoop.BlockMap;
import com.asakusafw.runtime.stage.StageInput;
import com.asakusafw.runtime.stage.temporary.TemporaryFile;
import com.asakusafw.runtime.stage.temporary.TemporaryFileInput;
/**
 * A temporary input format.
 * @param <T> data type
 * @since 0.2.5
 * @version 0.7.0
 */
public final class TemporaryInputFormat<T> extends InputFormat<NullWritable, T> {

    static final Log LOG = LogFactory.getLog(TemporaryInputFormat.class);

    /**
     * Configuration key for the preferred input split size (in bytes).
     */
    static final String KEY_DEFAULT_SPLIT_SIZE = "com.asakusafw.stage.input.temporary.blockSize";

    /**
     * Default input split size: 128MB.
     */
    static final long DEFAULT_SPLIT_SIZE = 128L * 1024 * 1024;

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        return getSplits(context.getConfiguration(), getInputPaths(context));
    }

    /**
     * Computes and returns splits for the specified inputs.
     * @param context current job context
     * @param inputList target input list
     * @return the computed splits
     * @throws IOException if failed to compute splits
     * @throws InterruptedException if interrupted while computing inputs
     * @throws IllegalArgumentException if some parameters were {@code null}
     */
    public List<InputSplit> getSplits(
            JobContext context,
            List<StageInput> inputList) throws IOException, InterruptedException {
        if (context == null) {
            throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
        }
        if (inputList == null) {
            throw new IllegalArgumentException("inputList must not be null"); //$NON-NLS-1$
        }
        List<Path> paths = new ArrayList<Path>();
        for (StageInput input : inputList) {
            paths.add(new Path(input.getPathString()));
        }
        return getSplits(context.getConfiguration(), paths);
    }

    /**
     * Expands each (possibly glob) path into its matching files and computes splits for each file.
     * Paths that match nothing are silently skipped.
     */
    private List<InputSplit> getSplits(Configuration configuration, List<Path> paths) throws IOException {
        long splitSize = configuration.getLong(KEY_DEFAULT_SPLIT_SIZE, DEFAULT_SPLIT_SIZE);
        List<InputSplit> results = new ArrayList<InputSplit>();
        for (Path path : paths) {
            FileSystem fs = path.getFileSystem(configuration);
            FileStatus[] statuses = fs.globStatus(path);
            if (statuses == null) {
                // no files matched this pattern
                continue;
            }
            for (FileStatus status : statuses) {
                BlockMap blockMap = BlockMap.create(
                        status.getPath().toString(),
                        status.getLen(),
                        BlockMap.computeBlocks(fs, status),
                        false);
                results.addAll(computeSplits(status.getPath(), blockMap, splitSize));
            }
        }
        return results;
    }

    /**
     * Compute input splits for the target file.
     * @param path the target file path
     * @param blockMap the file block map
     * @param splitSize the expected split size, or {@code <= 0} to prevent splits
     * @return the computed input splits for the file
     */
    static List<FileSplit> computeSplits(Path path, BlockMap blockMap, long splitSize) {
        // round the requested split size up to a multiple of the temporary file block size,
        // so that every split boundary falls on a block boundary
        long align = splitSize;
        if (splitSize > 0) {
            long remain = splitSize % TemporaryFile.BLOCK_SIZE;
            if (remain != 0) {
                align += TemporaryFile.BLOCK_SIZE - remain;
            }
        }
        long size = blockMap.getFileSize();
        long start = 0;
        List<FileSplit> results = new ArrayList<FileSplit>();
        for (BlockInfo block : blockMap.getBlocks()) {
            assert start % TemporaryFile.BLOCK_SIZE == 0;
            long end = block.getEnd();
            if (end < start) {
                // this physical block was already consumed by the previous split
                continue;
            }
            // extend the end to the next temporary-file block boundary (clamped to the file size)
            long remain = end % TemporaryFile.BLOCK_SIZE;
            if (remain != 0) {
                end = Math.min(size, end + (TemporaryFile.BLOCK_SIZE - remain));
            }
            results.addAll(createSplits(path, blockMap, start, end, align));
            start = end;
        }
        return results;
    }

    /**
     * Creates splits for the file region {@code [start, end)}, cutting it into pieces of
     * about {@code splitSize} bytes (no cut is made when {@code splitSize <= 0}).
     */
    private static List<FileSplit> createSplits(
            Path path, BlockMap blockMap, long start, long end, long splitSize) {
        if (start >= end) {
            return Collections.emptyList();
        }
        if (splitSize <= 0) {
            FileSplit split = getSplit(blockMap, path, start, end);
            return Collections.singletonList(split);
        }
        // allow up to 20% oversize for the last fragment instead of creating a tiny trailing split
        long threshold = (long) (splitSize * 1.2);
        List<FileSplit> results = new ArrayList<FileSplit>();
        long current = start;
        while (current < end) {
            long next;
            if (end - current < threshold) {
                next = end;
            } else {
                next = current + splitSize;
            }
            FileSplit split = getSplit(blockMap, path, current, next);
            results.add(split);
            current = next;
        }
        return results;
    }

    /**
     * Creates a single split for {@code [start, end)}, locating its preferred host names
     * from the block map.
     */
    private static FileSplit getSplit(BlockMap blockMap, Path path, long start, long end) {
        DirectInputFragment f = blockMap.get(start, end);
        List<String> owners = f.getOwnerNodeNames();
        FileSplit split = new FileSplit(
                path, start, end - start,
                owners.toArray(new String[owners.size()]));
        return split;
    }

    /**
     * Configures input paths.
     * @param job current job
     * @param paths source paths
     * @throws IOException if failed to resolve paths
     * @throws IllegalArgumentException if some parameters were {@code null}
     */
    public static void setInputPaths(Job job, List<Path> paths) throws IOException {
        if (job == null) {
            throw new IllegalArgumentException("job must not be null"); //$NON-NLS-1$
        }
        if (paths == null) {
            throw new IllegalArgumentException("paths must not be null"); //$NON-NLS-1$
        }
        FileInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
    }

    /**
     * Returns input paths.
     * @param context current job
     * @return the input paths, or an empty list if they are not set
     * @throws IOException if failed to resolve paths
     * @throws IllegalArgumentException if some parameters were {@code null}
     * @since 0.7.0
     */
    public static List<Path> getInputPaths(JobContext context) throws IOException {
        if (context == null) {
            // fixed: message previously said "job" although the parameter is "context"
            throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
        }
        Path[] paths = FileInputFormat.getInputPaths(context);
        if (paths == null || paths.length == 0) {
            return Collections.emptyList();
        }
        return Arrays.asList(paths);
    }

    @Override
    public RecordReader<NullWritable, T> createRecordReader(
            InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit s = (FileSplit) split;
        assert s.getStart() % TemporaryFile.BLOCK_SIZE == 0;
        assert s.getStart() > 0 || s.getLength() > 0;
        return createRecordReader();
    }

    /**
     * Create a record reader for this input format.
     * @param <T> the value type
     * @return the record reader
     */
    @SuppressWarnings("unchecked")
    static <T> RecordReader<NullWritable, T> createRecordReader() {
        return (RecordReader<NullWritable, T>) new Reader<Writable>();
    }

    /**
     * A {@link RecordReader} over a {@link TemporaryFileInput}.
     * The value type is resolved at {@link #initialize(InputSplit, TaskAttemptContext)} time
     * from the data type name recorded in the temporary file header.
     * @param <T> the value type
     */
    private static final class Reader<T extends Writable> extends RecordReader<NullWritable, T> {

        // total number of bytes in the assigned split (for progress reporting)
        private long size;

        private TemporaryFileInput<T> input;

        // the mutable value object re-filled by each nextKeyValue() call
        private T value;

        public Reader() {
            return;
        }

        @SuppressWarnings("unchecked")
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            FileSplit s = (FileSplit) split;
            this.size = s.getLength();
            Path path = s.getPath();
            FileSystem fs = path.getFileSystem(context.getConfiguration());
            int blocks = computeBlocks(s);
            FSDataInputStream stream = fs.open(path);
            boolean succeed = false;
            try {
                if (s.getStart() != 0) {
                    assert s.getStart() % TemporaryFile.BLOCK_SIZE == 0;
                    stream.seek(s.getStart());
                }
                this.input = (TemporaryFileInput<T>) new TemporaryFileInput<Writable>(stream, blocks);
                // instantiate the value object from the type name stored in the file itself
                Class<?> aClass = context.getConfiguration().getClassByName(input.getDataTypeName());
                this.value = (T) ReflectionUtils.newInstance(aClass, context.getConfiguration());
                succeed = true;
            } catch (ClassNotFoundException e) {
                throw new IOException(e);
            } finally {
                if (succeed == false) {
                    // avoid leaking the stream if seeking or header parsing failed
                    stream.close();
                }
            }
        }

        /**
         * Returns the number of temporary-file blocks covered by the split (rounded up).
         */
        private int computeBlocks(FileSplit s) {
            long length = s.getLength() + TemporaryFile.BLOCK_SIZE - 1;
            return (int) (length / TemporaryFile.BLOCK_SIZE);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return input.readTo(value);
        }

        @Override
        public NullWritable getCurrentKey() throws IOException, InterruptedException {
            return NullWritable.get();
        }

        @Override
        public T getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            if (size == 0) {
                // avoid 0/0 = NaN for zero-length splits
                return 1.0f;
            }
            long current = input.getCurrentBlock() * (long) TemporaryFile.BLOCK_SIZE;
            current += input.getPositionInBlock();
            return (float) current / size;
        }

        @Override
        public void close() throws IOException {
            if (input != null) {
                input.close();
            }
        }
    }
}