/*
* Copyright 2012 NGDATA nv
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.lilyproject.mapreduce;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.codehaus.jackson.JsonNode;

import org.lilyproject.client.LilyClient;
import org.lilyproject.repository.api.LRepository;
import org.lilyproject.repository.api.RecordScan;
import org.lilyproject.repository.api.RepositoryException;
import org.lilyproject.repository.api.RepositoryTable;
import org.lilyproject.repository.api.TableManager;
import org.lilyproject.tools.import_.json.RecordScanReader;
import org.lilyproject.util.exception.ExceptionUtil;
import org.lilyproject.util.hbase.RepoAndTableUtil;
import org.lilyproject.util.io.Closer;
import org.lilyproject.util.json.JsonFormat;
import org.lilyproject.util.zookeeper.ZkConnectException;
import org.lilyproject.util.zookeeper.ZkUtil;
import org.lilyproject.util.zookeeper.ZooKeeperItf;
/**
* A MapReduce InputFormat for Lily based on Lily scanners.
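 *
 * <p>Minimal configuration sketch (the job wiring and the concrete values shown
 * are illustrative, not prescribed by this class; the keys are the ones this
 * class actually reads):
 *
 * <pre>
 * Configuration conf = job.getConfiguration();
 * conf.set(LilyMapReduceUtil.ZK_CONNECT_STRING, "zkhost:2181");
 * conf.set(LilyMapReduceUtil.REPOSITORY_NAME, "default");
 * // Optional: comma-separated Lily table names; all tables when absent.
 * conf.set(LilyMapReduceUtil.REPOSITORY_TABLES, "record");
 * // Optional: a JSON-serialized RecordScan; a full scan when absent.
 * conf.set(AbstractLilyScanInputFormat.SCAN, scanJson);
 * </pre>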
*/
public abstract class AbstractLilyScanInputFormat<KEYIN, VALUEIN> extends InputFormat<KEYIN, VALUEIN> implements Configurable {
public static final String SCAN = "lily.mapreduce.scan";
final Log log = LogFactory.getLog(AbstractLilyScanInputFormat.class);
private Configuration conf;
protected String zkConnectString;
protected String repositoryName;
@Override
public void setConf(Configuration conf) {
this.conf = conf;
zkConnectString = conf.get(LilyMapReduceUtil.ZK_CONNECT_STRING);
if (zkConnectString == null) {
log.warn("ZooKeeper connection string not specified, will use 'localhost'.");
zkConnectString = "localhost";
}
repositoryName = conf.get(LilyMapReduceUtil.REPOSITORY_NAME);
if (repositoryName == null) {
throw new RuntimeException("Missing " + LilyMapReduceUtil.REPOSITORY_NAME + " in job conf.");
}
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
ZooKeeperItf zk = null;
LilyClient lilyClient = null;
Configuration hbaseConf = null;
List<InputSplit> inputSplits = Lists.newArrayList();
try {
zk = ZkUtil.connect(zkConnectString, 30000);
            // Need a connection to Lily to parse the RecordScan (a bit lame)
try {
lilyClient = new LilyClient(zk);
} catch (Exception e) {
throw new IOException("Error setting up LilyClient", e);
}
LRepository repository = lilyClient.getRepository(repositoryName);
RecordScan scan = getScan(repository);
// Determine start and stop row
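            // Raw record id bytes take precedence over typed record ids; an
            // empty array leaves that side of the scan open-ended.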
byte[] startRow;
if (scan.getRawStartRecordId() != null) {
startRow = scan.getRawStartRecordId();
} else if (scan.getStartRecordId() != null) {
startRow = scan.getStartRecordId().toBytes();
} else {
startRow = new byte[0];
}
byte[] stopRow;
if (scan.getRawStopRecordId() != null) {
stopRow = scan.getRawStopRecordId();
} else if (scan.getStopRecordId() != null) {
stopRow = scan.getStopRecordId().toBytes();
} else {
stopRow = new byte[0];
}
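            // Compute one split per region of each underlying HBase table.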
hbaseConf = LilyClient.getHBaseConfiguration(zk);
for (String tableName : getRepositoryHBaseTableNames(repository.getTableManager(),
jobContext.getConfiguration())) {
HTable table = new HTable(hbaseConf, tableName);
try {
inputSplits.addAll(getSplits(table, startRow, stopRow));
} finally {
Closer.close(table);
}
}
return inputSplits;
} catch (ZkConnectException e) {
throw new IOException("Error setting up splits", e);
} catch (RepositoryException e) {
throw new IOException("Error setting up splits", e);
} finally {
Closer.close(zk);
if (hbaseConf != null) {
HConnectionManager.deleteConnection(hbaseConf, true);
}
Closer.close(lilyClient);
}
}
/**
* Returns the HBase table names of the repository tables to be included in this job.
*/
private List<String> getRepositoryHBaseTableNames(TableManager tableManager, Configuration conf)
throws InterruptedException, IOException {
        // Key is the Lily table name, value is the HBase table name (i.e. the
        // name including the repository name prefix).
Map<String, String> allRepoTables = Maps.newHashMap();
for (RepositoryTable repoTable : tableManager.getTables()) {
allRepoTables.put(repoTable.getName(),
RepoAndTableUtil.getHBaseTableName(repoTable.getRepositoryName(), repoTable.getName()));
}
List<String> tablesToInclude = Lists.newArrayList();
String tableListStr = conf.get(LilyMapReduceUtil.REPOSITORY_TABLES);
if (tableListStr == null) {
tablesToInclude.addAll(allRepoTables.values());
} else {
for (String toInclude : tableListStr.split(",")) {
if (!allRepoTables.containsKey(toInclude)) {
throw new IllegalArgumentException(String.format("'%s' is not a repository table", toInclude));
}
tablesToInclude.add(allRepoTables.get(toInclude));
}
}
return tablesToInclude;
}
    /**
     * Computes one split per HBase region that overlaps the scan range.
     *
     * <p>License note: this code was copied from HBase's TableInputFormat.
     *
     * @param table the HBase table to compute splits for
     * @param startRow start row of the scan, empty for an open start
     * @param stopRow stop row of the scan, empty for an open end
     * @return the list of splits, one per overlapping region
     */
public List<InputSplit> getSplits(HTable table, final byte[] startRow, final byte[] stopRow) throws IOException {
if (table == null) {
throw new IOException("No table was provided.");
}
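        // Parallel arrays holding the start and end key of every region.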
Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
if (keys == null || keys.getFirst() == null ||
keys.getFirst().length == 0) {
throw new IOException("Expecting at least one region.");
}
int count = 0;
List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
for (int i = 0; i < keys.getFirst().length; i++) {
if ( !includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
continue;
}
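            // The region's host name becomes the split's locality hint, so
            // MapReduce can schedule the map task close to the data.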
String regionLocation = table.getRegionLocation(keys.getFirst()[i]).
getServerAddress().getHostname();
            // determine if the given start and stop keys fall into the region
if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
(stopRow.length == 0 ||
Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
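                // Clamp the split to the scan range: start at the later of
                // region start and scan start; stop at the earlier of region
                // end and scan stop (an empty region end key marks the last
                // region, in which case the scan's stopRow applies).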
byte[] splitStart = startRow.length == 0 ||
Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
keys.getFirst()[i] : startRow;
byte[] splitStop = (stopRow.length == 0 ||
Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
keys.getSecond()[i].length > 0 ?
keys.getSecond()[i] : stopRow;
InputSplit split = new TableSplit(table.getTableName(),
splitStart, splitStop, regionLocation);
splits.add(split);
if (log.isDebugEnabled()) {
log.debug("getSplits: split -> " + (count++) + " -> " + split);
}
}
}
return splits;
}
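    /**
     * Hook allowing subclasses to exclude certain regions from the splits,
     * mirroring the hook of the same name in HBase's TableInputFormatBase.
     * The default implementation includes every region.
     */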
protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
return true;
}
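    /**
     * Reconstructs the {@link RecordScan} from its JSON serialization stored
     * under {@link #SCAN} in the job configuration; when no scan was
     * configured, a scan over all records is returned.
     */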
protected RecordScan getScan(LRepository repository) {
RecordScan scan;
String scanData = conf.get(SCAN);
if (scanData != null) {
try {
JsonNode node = JsonFormat.deserializeNonStd(scanData);
scan = RecordScanReader.INSTANCE.fromJson(node, repository);
} catch (Exception e) {
ExceptionUtil.handleInterrupt(e);
throw new RuntimeException(e);
}
} else {
scan = new RecordScan();
}
return scan;
}
}