/**
* This software is licensed to you under the Apache License, Version 2.0 (the
* "Apache License").
*
* LinkedIn's contributions are made under the Apache License. If you contribute
* to the Software, the contributions will be deemed to have been made under the
* Apache License, unless you expressly indicate otherwise. Please do not make any
* contributions that would be inconsistent with the Apache License.
*
* You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, this software
* distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
* License for the specific language governing permissions and limitations for the
* software governed under the Apache License.
*
* © 2012 LinkedIn Corp. All Rights Reserved.
*/
package com.senseidb.indexing.hadoop.job;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.NumberFormat;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;

import com.senseidb.indexing.hadoop.keyvalueformat.IntermediateForm;
import com.senseidb.indexing.hadoop.keyvalueformat.Shard;
import com.senseidb.indexing.hadoop.map.SenseiMapper;
import com.senseidb.indexing.hadoop.reduce.FileSystemDirectory;
import com.senseidb.indexing.hadoop.reduce.IndexUpdateOutputFormat;
import com.senseidb.indexing.hadoop.reduce.SenseiCombiner;
import com.senseidb.indexing.hadoop.reduce.SenseiReducer;
import com.senseidb.indexing.hadoop.util.LuceneUtil;
import com.senseidb.indexing.hadoop.util.MRConfig;
import com.senseidb.indexing.hadoop.util.MRJobConfig;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;
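
/**
 * Builds the Hadoop {@link JobConf} for the Sensei index-building job: resolves
 * input/output paths, computes the target {@link Shard} layout, stages the
 * schema file in the distributed cache, and wires up the Sensei mapper,
 * combiner, reducer and output format.
 */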
public class MapReduceJob extends Configured {
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
static {
  // avoid locale-dependent grouping separators (e.g. "1,000") in generated shard
  // names, since the shard list is serialized as a comma-separated string
  NUMBER_FORMAT.setGroupingUsed(false);
}
private static final Logger logger = Logger.getLogger(MapReduceJob.class);
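
/**
 * Creates a fully configured {@link JobConf} from the {@link Configuration}
 * held by this {@link Configured} instance.
 *
 * A minimal driver sketch (illustrative only; the driver class name and the
 * way the base configuration is populated are assumptions, not part of this
 * class):
 * <pre>
 *   MapReduceJob job = new MapReduceJob();
 *   job.setConf(conf); // conf must carry the SenseiJobConfig.* settings
 *   JobConf jobConf = job.createJob(MyIndexingDriver.class); // hypothetical driver class
 *   JobClient.runJob(jobConf);
 * </pre>
 *
 * @param mrClass class used to locate the job jar and to derive a default job name
 * @return the configured job
 * @throws IOException if required settings are missing or the filesystem cannot be accessed
 * @throws URISyntaxException if the schema file URL is malformed
 */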
public JobConf createJob(Class<?> mrClass) throws IOException, URISyntaxException {
Configuration conf = getConf();
Path[] inputPaths;
Path outputPath;
Shard[] shards = null;
int numMapTasks = conf.getInt(MRJobConfig.NUM_MAPS, 2);
int numShards = conf.getInt(SenseiJobConfig.NUM_SHARDS, 2);
String dirs = conf.get(SenseiJobConfig.INPUT_DIRS, null);
if (dirs == null) {
  throw new IOException("no input directories are specified ("
      + SenseiJobConfig.INPUT_DIRS + ")");
}
logger.info("dirs: " + dirs);
String[] list = StringUtils.split(dirs);
logger.info("length after split: " + list.length);
inputPaths = new Path[list.length];
for (int i = 0; i < list.length; i++) {
  inputPaths[i] = new Path(StringUtils.unEscapeString(list[i]));
}
logger.info("path[0] is: " + inputPaths[0]);
outputPath = new Path(conf.get(SenseiJobConfig.OUTPUT_DIR));
String indexPath = conf.get(SenseiJobConfig.INDEX_PATH);
String indexSubDirPrefix = conf.get(SenseiJobConfig.INDEX_SUBDIR_PREFIX, "");
shards = createShards(indexPath, numShards, conf, indexSubDirPrefix);
FileSystem fs = FileSystem.get(conf);
boolean overwrite = conf.getBoolean(SenseiJobConfig.FORCE_OUTPUT_OVERWRITE, false);
if (overwrite && fs.exists(outputPath)) {
  fs.delete(outputPath, true);
}
if (overwrite && fs.exists(new Path(indexPath))) {
  fs.delete(new Path(indexPath), true);
}
// set the starting generation for each shard
// when a reduce task fails, a new reduce task
// has to know where to re-start
setShardGeneration(conf, shards);
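// serialize the shard list into the job configuration so that map/reduce
// tasks can recover it (see the INDEX_SHARDS logging below)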
Shard.setIndexShards(conf, shards);
// MapTask.MapOutputBuffer uses JobContext.IO_SORT_MB to decide its max buffer size
// (max buffer size = 1/2 * JobContext.IO_SORT_MB).
// Here we halve JobContext.IO_SORT_MB because we use the other half of the
// memory to build an intermediate form/index in the combiner.
conf.setInt(MRJobConfig.IO_SORT_MB, conf.getInt(MRJobConfig.IO_SORT_MB, 100) / 2);
// set the temp dir for the job
conf.set(MRConfig.TEMP_DIR, "${mapred.child.tmp}/hindex/");
if (fs.exists(new Path(conf.get(MRConfig.TEMP_DIR)))) {
  fs.delete(new Path(conf.get(MRConfig.TEMP_DIR)), true);
}
if (fs.exists(new Path("./tmp"))) {
  fs.delete(new Path("./tmp"), true);
}
(new Trash(conf)).expunge(); // empty the trash
// always use the compound file format to speed things up
conf.setBoolean(SenseiJobConfig.USE_COMPOUND_FILE, true);
String schemaFile = conf.get(SenseiJobConfig.SCHEMA_FILE_URL);
if (schemaFile == null) {
  throw new IOException("no schema file is found");
} else {
  logger.info("Adding schema file: " + schemaFile);
  DistributedCache.addCacheFile(new URI(schemaFile), conf);
}
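// SenseiMapper is expected to load the schema back from the distributed
// cache when the task is configured.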
// create the job configuration
JobConf jobConf = new JobConf(conf, mrClass);
if (jobConf.getJobName().length() < 1) {
  jobConf.setJobName(mrClass.getName() + "_" + System.currentTimeMillis());
}
// provided by application
FileInputFormat.setInputPaths(jobConf, inputPaths);
FileOutputFormat.setOutputPath(jobConf, outputPath);
jobConf.setNumMapTasks(numMapTasks);
// one reduce task per shard
jobConf.setNumReduceTasks(shards.length);
jobConf.setInputFormat(
conf.getClass(SenseiJobConfig.INPUT_FORMAT, TextInputFormat.class, InputFormat.class));
Path[] inputs = FileInputFormat.getInputPaths(jobConf);
StringBuilder buffer = new StringBuilder(inputs[0].toString());
for (int i = 1; i < inputs.length; i++) {
buffer.append(",");
buffer.append(inputs[i].toString());
}
logger.info("mapred.input.dir = " + buffer.toString());
logger.info("mapreduce.output.fileoutputformat.outputdir = " +
FileOutputFormat.getOutputPath(jobConf).toString());
logger.info("mapreduce.job.maps = " + jobConf.getNumMapTasks());
logger.info("mapreduce.job.reduces = " + jobConf.getNumReduceTasks());
logger.info(shards.length + " shards = " + conf.get(SenseiJobConfig.INDEX_SHARDS));
logger.info("mapred.input.format.class = "
+ jobConf.getInputFormat().getClass().getName());
logger.info("mapreduce.cluster.temp.dir = " + jobConf.get(MRConfig.TEMP_DIR));
// set by the system
jobConf.setMapOutputKeyClass(Shard.class);
jobConf.setMapOutputValueClass(IntermediateForm.class);
jobConf.setOutputKeyClass(Shard.class);
jobConf.setOutputValueClass(Text.class);
jobConf.setMapperClass(SenseiMapper.class);
// no partitioner class is needed
jobConf.setCombinerClass(SenseiCombiner.class);
jobConf.setReducerClass(SenseiReducer.class);
jobConf.setOutputFormat(IndexUpdateOutputFormat.class);
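// disable speculative execution for reducers: concurrent attempts of the same
// reduce task could otherwise write to the same shard's index directory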
jobConf.setReduceSpeculativeExecution(false);
return jobConf;
}
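
/**
 * Returns a {@link FileSystem} handle for the given user via the legacy
 * {@code hadoop.job.ugi} property. Note: currently unreferenced in this class.
 */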
private static FileSystem getFileSystem(String user) {
  Configuration conf = new Configuration();
  conf.set("hadoop.job.ugi", user);
  try {
    return FileSystem.get(conf);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
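
/**
 * Computes the target shard layout under {@code indexPath}: existing shard
 * directories are reused (sorted by name), and if fewer than {@code numShards}
 * exist, new shard paths are generated with the given prefix, skipping names
 * that are already taken.
 */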
private static Shard[] createShards(String indexPath, int numShards,
org.apache.hadoop.conf.Configuration conf, String indexSubDirPrefix) throws IOException {
String parent = Shard.normalizePath(indexPath) + Path.SEPARATOR;
long versionNumber = -1;
long generation = -1;
FileSystem fs = FileSystem.get(conf);
Path path = new Path(indexPath);
if (fs.exists(path)) {
FileStatus[] fileStatus = fs.listStatus(path);
String[] shardNames = new String[fileStatus.length];
int count = 0;
for (int i = 0; i < fileStatus.length; i++) {
if (fileStatus[i].isDir()) {
shardNames[count] = fileStatus[i].getPath().getName();
count++;
}
}
Arrays.sort(shardNames, 0, count);
Shard[] shards = new Shard[Math.max(count, numShards)];
for (int i = 0; i < count; i++) {
shards[i] =
new Shard(versionNumber, parent + shardNames[i], generation);
}
int number = count;
for (int i = count; i < numShards; i++) {
String shardPath;
while (true) {
shardPath = parent + indexSubDirPrefix + NUMBER_FORMAT.format(number++);
if (!fs.exists(new Path(shardPath))) {
break;
}
}
shards[i] = new Shard(versionNumber, shardPath, generation);
}
return shards;
} else {
Shard[] shards = new Shard[numShards];
for (int i = 0; i < shards.length; i++) {
shards[i] =
new Shard(versionNumber, parent + indexSubDirPrefix + NUMBER_FORMAT.format(i),
generation);
}
return shards;
}
}
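
/**
 * Reads the current Lucene segment generation of each existing shard directory
 * so that a restarted reduce task knows where to resume, replacing shards whose
 * recorded generation is stale.
 */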
void setShardGeneration(Configuration conf, Shard[] shards)
throws IOException {
FileSystem fs = FileSystem.get(conf);
for (int i = 0; i < shards.length; i++) {
Path path = new Path(shards[i].getDirectory());
long generation = -1;
if (fs.exists(path)) {
FileSystemDirectory dir = null;
try {
dir = new FileSystemDirectory(fs, path, false, conf);
generation = LuceneUtil.getCurrentSegmentGeneration(dir);
} finally {
if (dir != null) {
dir.close();
}
}
}
if (generation != shards[i].getGeneration()) {
// set the starting generation for the shard
shards[i] =
new Shard(shards[i].getVersion(), shards[i].getDirectory(),
generation);
}
}
}
}