/*
* Copyright 2012 NGDATA nv
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.lilyproject.mapreduce;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.codehaus.jackson.JsonNode;

import org.lilyproject.client.LilyClient;
import org.lilyproject.repository.api.LRepository;
import org.lilyproject.repository.api.RecordScan;
import org.lilyproject.repository.api.RepositoryException;
import org.lilyproject.repository.api.RepositoryTable;
import org.lilyproject.repository.api.TableManager;
import org.lilyproject.tools.import_.json.RecordScanReader;
import org.lilyproject.util.exception.ExceptionUtil;
import org.lilyproject.util.hbase.RepoAndTableUtil;
import org.lilyproject.util.io.Closer;
import org.lilyproject.util.json.JsonFormat;
import org.lilyproject.util.zookeeper.ZkConnectException;
import org.lilyproject.util.zookeeper.ZkUtil;
import org.lilyproject.util.zookeeper.ZooKeeperItf;
/**
* A MapReduce InputFormat for Lily based on Lily scanners.
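 *
 * <p>Minimal configuration sketch (the job wiring and the concrete values shown
 * are illustrative, not prescribed by this class; the keys are the ones this
 * class actually reads):
 *
 * <pre>
 * Configuration conf = job.getConfiguration();
 * conf.set(LilyMapReduceUtil.ZK_CONNECT_STRING, "zkhost:2181");
 * conf.set(LilyMapReduceUtil.REPOSITORY_NAME, "default");
 * // Optional: comma-separated Lily table names; all tables when absent.
 * conf.set(LilyMapReduceUtil.REPOSITORY_TABLES, "record");
 * // Optional: a JSON-serialized RecordScan; a full scan when absent.
 * conf.set(AbstractLilyScanInputFormat.SCAN, scanJson);
 * </pre>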
*/
public abstract class AbstractLilyScanInputFormat<KEYIN, VALUEIN> extends InputFormat<KEYIN, VALUEIN> implements Configurable {
public static final String SCAN = "lily.mapreduce.scan";
final Log log = LogFactory.getLog(AbstractLilyScanInputFormat.class);
private Configuration conf;
protected String zkConnectString;
protected String repositoryName;
@Override
public void setConf(Configuration conf) {
this.conf = conf;
zkConnectString = conf.get(LilyMapReduceUtil.ZK_CONNECT_STRING);
if (zkConnectString == null) {
log.warn("ZooKeeper connection string not specified, will use 'localhost'.");
zkConnectString = "localhost";
}
repositoryName = conf.get(LilyMapReduceUtil.REPOSITORY_NAME);
if (repositoryName == null) {
throw new RuntimeException("Missing " + LilyMapReduceUtil.REPOSITORY_NAME + " in job conf.");
}
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
ZooKeeperItf zk = null;
LilyClient lilyClient = null;
Configuration hbaseConf = null;
List<InputSplit> inputSplits = Lists.newArrayList();
try {
zk = ZkUtil.connect(zkConnectString, 30000);
            // Need a connection to Lily to parse the RecordScan (a bit lame)
try {
lilyClient = new LilyClient(zk);
} catch (Exception e) {
throw new IOException("Error setting up LilyClient", e);
}
LRepository repository = lilyClient.getRepository(repositoryName);
RecordScan scan = getScan(repository);
// Determine start and stop row
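            // Raw record id bytes take precedence over typed record ids; an
            // empty array leaves that side of the scan open-ended.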
byte[] startRow;
if (scan.getRawStartRecordId() != null) {
startRow = scan.getRawStartRecordId();
} else if (scan.getStartRecordId() != null) {
startRow = scan.getStartRecordId().toBytes();
} else {
startRow = new byte[0];
}
byte[] stopRow;
if (scan.getRawStopRecordId() != null) {
stopRow = scan.getRawStopRecordId();
} else if (scan.getStopRecordId() != null) {
stopRow = scan.getStopRecordId().toBytes();
} else {
stopRow = new byte[0];
}
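            // Compute one split per region of each underlying HBase table.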
hbaseConf = LilyClient.getHBaseConfiguration(zk);
for (String tableName : getRepositoryHBaseTableNames(repository.getTableManager(),
jobContext.getConfiguration())) {
HTable table = new HTable(hbaseConf, tableName);
try {
inputSplits.addAll(getSplits(table, startRow, stopRow));
} finally {
Closer.close(table);
}
}
return inputSplits;
} catch (ZkConnectException e) {
throw new IOException("Error setting up splits", e);
} catch (RepositoryException e) {
throw new IOException("Error setting up splits", e);
} finally {
Closer.close(zk);
if (hbaseConf != null) {
HConnectionManager.deleteConnection(hbaseConf, true);
}
Closer.close(lilyClient);
}
}
/**
* Returns the HBase table names of the repository tables to be included in this job.
*/
private List<String> getRepositoryHBaseTableNames(TableManager tableManager, Configuration conf)
throws InterruptedException, IOException {
        // Key is the Lily table name, value is the HBase table name (i.e. the
        // name including the repository name prefix).
Map<String, String> allRepoTables = Maps.newHashMap();
for (RepositoryTable repoTable : tableManager.getTables()) {
allRepoTables.put(repoTable.getName(),
RepoAndTableUtil.getHBaseTableName(repoTable.getRepositoryName(), repoTable.getName()));
}
List<String> tablesToInclude = Lists.newArrayList();
String tableListStr = conf.get(LilyMapReduceUtil.REPOSITORY_TABLES);
if (tableListStr == null) {
tablesToInclude.addAll(allRepoTables.values());
} else {
for (String toInclude : tableListStr.split(",")) {
if (!allRepoTables.containsKey(toInclude)) {
throw new IllegalArgumentException(String.format("'%s' is not a repository table", toInclude));
}
tablesToInclude.add(allRepoTables.get(toInclude));
}
}
return tablesToInclude;
}
    /**
     * Computes one split per HBase region that overlaps the scan range.
     *
     * <p>License note: this code was copied from HBase's TableInputFormat.
     *
     * @param table the HBase table to compute splits for
     * @param startRow start row of the scan, empty for an open start
     * @param stopRow stop row of the scan, empty for an open end
     * @return the list of splits, one per overlapping region
     */
public List<InputSplit> getSplits(HTable table, final byte[] startRow, final byte[] stopRow) throws IOException {
if (table == null) {
throw new IOException("No table was provided.");
}
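        // Parallel arrays holding the start and end key of every region.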
Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
if (keys == null || keys.getFirst() == null ||
keys.getFirst().length == 0) {
throw new IOException("Expecting at least one region.");
}
int count = 0;
List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
for (int i = 0; i < keys.getFirst().length; i++) {
if ( !includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
continue;
}
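            // The region's host name becomes the split's locality hint, so
            // MapReduce can schedule the map task close to the data.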
String regionLocation = table.getRegionLocation(keys.getFirst()[i]).
getServerAddress().getHostname();
            // determine if the given start and stop keys fall into the region
if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
(stopRow.length == 0 ||
Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
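                // Clamp the split to the scan range: start at the later of
                // region start and scan start; stop at the earlier of region
                // end and scan stop (an empty region end key marks the last
                // region, in which case the scan's stopRow applies).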
byte[] splitStart = startRow.length == 0 ||
Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
keys.getFirst()[i] : startRow;
byte[] splitStop = (stopRow.length == 0 ||
Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
keys.getSecond()[i].length > 0 ?
keys.getSecond()[i] : stopRow;
InputSplit split = new TableSplit(table.getTableName(),
splitStart, splitStop, regionLocation);
splits.add(split);
if (log.isDebugEnabled()) {
log.debug("getSplits: split -> " + (count++) + " -> " + split);
}
}
}
return splits;
}
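    /**
     * Hook allowing subclasses to exclude certain regions from the splits,
     * mirroring the hook of the same name in HBase's TableInputFormatBase.
     * The default implementation includes every region.
     */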
protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
return true;
}
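    /**
     * Reconstructs the {@link RecordScan} from its JSON serialization stored
     * under {@link #SCAN} in the job configuration; when no scan was
     * configured, a scan over all records is returned.
     */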
protected RecordScan getScan(LRepository repository) {
RecordScan scan;
String scanData = conf.get(SCAN);
if (scanData != null) {
try {
JsonNode node = JsonFormat.deserializeNonStd(scanData);
scan = RecordScanReader.INSTANCE.fromJson(node, repository);
} catch (Exception e) {
ExceptionUtil.handleInterrupt(e);
throw new RuntimeException(e);
}
} else {
scan = new RecordScan();
}
return scan;
}
}