// Package: org.lilyproject.mapreduce
// Source code of org.lilyproject.mapreduce.AbstractLilyScanInputFormat

/*
* Copyright 2012 NGDATA nv
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.lilyproject.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.lilyproject.util.hbase.RepoAndTableUtil;


import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.codehaus.jackson.JsonNode;
import org.lilyproject.client.LilyClient;
import org.lilyproject.repository.api.LRepository;
import org.lilyproject.repository.api.RecordScan;
import org.lilyproject.repository.api.RepositoryException;
import org.lilyproject.repository.api.RepositoryTable;
import org.lilyproject.repository.api.TableManager;
import org.lilyproject.tools.import_.json.RecordScanReader;
import org.lilyproject.util.exception.ExceptionUtil;
import org.lilyproject.util.io.Closer;
import org.lilyproject.util.json.JsonFormat;
import org.lilyproject.util.zookeeper.ZkConnectException;
import org.lilyproject.util.zookeeper.ZkUtil;
import org.lilyproject.util.zookeeper.ZooKeeperItf;

/**
* A MapReduce InputFormat for Lily based on Lily scanners.
*/
public abstract class AbstractLilyScanInputFormat<KEYIN, VALUEIN> extends InputFormat<KEYIN, VALUEIN> implements Configurable {

    public static final String SCAN = "lily.mapreduce.scan";

    final Log log = LogFactory.getLog(AbstractLilyScanInputFormat.class);

    private Configuration conf;
    protected String zkConnectString;
    protected String repositoryName;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        zkConnectString = conf.get(LilyMapReduceUtil.ZK_CONNECT_STRING);
        if (zkConnectString == null) {
            log.warn("ZooKeeper connection string not specified, will use 'localhost'.");
            zkConnectString = "localhost";
        }
        repositoryName = conf.get(LilyMapReduceUtil.REPOSITORY_NAME);
        if (repositoryName == null) {
            throw new RuntimeException("Missing " + LilyMapReduceUtil.REPOSITORY_NAME + " in job conf.");
        }
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
        ZooKeeperItf zk = null;
        LilyClient lilyClient = null;
        Configuration hbaseConf = null;
        List<InputSplit> inputSplits = Lists.newArrayList();
        try {
            zk = ZkUtil.connect(zkConnectString, 30000);

            // Need connection to Lily to parse RecordScan (a bit lame)
            lilyClient = null;
            try {
                lilyClient = new LilyClient(zk);
            } catch (Exception e) {
                throw new IOException("Error setting up LilyClient", e);
            }

            LRepository repository = lilyClient.getRepository(repositoryName);
            RecordScan scan = getScan(repository);

            // Determine start and stop row
            byte[] startRow;
            if (scan.getRawStartRecordId() != null) {
                startRow = scan.getRawStartRecordId();
            } else if (scan.getStartRecordId() != null) {
                startRow = scan.getStartRecordId().toBytes();
            } else {
                startRow = new byte[0];
            }

            byte[] stopRow;
            if (scan.getRawStopRecordId() != null) {
                stopRow = scan.getRawStopRecordId();
            } else if (scan.getStopRecordId() != null) {
                stopRow = scan.getStopRecordId().toBytes();
            } else {
                stopRow = new byte[0];
            }

            //
            hbaseConf = LilyClient.getHBaseConfiguration(zk);

            for (String tableName : getRepositoryHBaseTableNames(repository.getTableManager(),
                    jobContext.getConfiguration())) {
                HTable table = new HTable(hbaseConf, tableName);
                try {
                    inputSplits.addAll(getSplits(table, startRow, stopRow));
                } finally {
                    Closer.close(table);
                }
            }
            return inputSplits;
        } catch (ZkConnectException e) {
            throw new IOException("Error setting up splits", e);
        } catch (RepositoryException e) {
            throw new IOException("Error setting up splits", e);
        } finally {
            Closer.close(zk);
            if (hbaseConf != null) {
                HConnectionManager.deleteConnection(hbaseConf, true);
            }
            Closer.close(lilyClient);
        }
    }

    /**
     * Returns the HBase table names of the repository tables to be included in this job.
     */
    private List<String> getRepositoryHBaseTableNames(TableManager tableManager, Configuration conf)
            throws InterruptedException, IOException {
        // key is lily table name, value is hbase table name (= the name including the repository name prefix)
        Map<String, String> allRepoTables = Maps.newHashMap();
        for (RepositoryTable repoTable : tableManager.getTables()) {
            allRepoTables.put(repoTable.getName(),
                    RepoAndTableUtil.getHBaseTableName(repoTable.getRepositoryName(), repoTable.getName()));
        }

        List<String> tablesToInclude = Lists.newArrayList();
        String tableListStr = conf.get(LilyMapReduceUtil.REPOSITORY_TABLES);
        if (tableListStr == null) {
            tablesToInclude.addAll(allRepoTables.values());
        } else {
            for (String toInclude : tableListStr.split(",")) {
                if (!allRepoTables.containsKey(toInclude)) {
                    throw new IllegalArgumentException(String.format("'%s' is not a repository table", toInclude));
                }
                tablesToInclude.add(allRepoTables.get(toInclude));
            }
        }
        return tablesToInclude;

    }

    /**
     * License note: this code was copied from HBase's TableInputFormat.
     *
     * @param startRow start row of the scan
     * @param stopRow stop row of the scan
     */
    public List<InputSplit> getSplits(HTable table, final byte[] startRow, final byte[] stopRow) throws IOException {
        if (table == null) {
            throw new IOException("No table was provided.");
        }
        Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
        if (keys == null || keys.getFirst() == null ||
                keys.getFirst().length == 0) {
            throw new IOException("Expecting at least one region.");
        }
        int count = 0;
        List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
        for (int i = 0; i < keys.getFirst().length; i++) {
            if ( !includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
                continue;
            }
            String regionLocation = table.getRegionLocation(keys.getFirst()[i]).
                    getServerAddress().getHostname();
            // determine if the given start an stop key fall into the region
            if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
                    Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
                    (stopRow.length == 0 ||
                            Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                byte[] splitStart = startRow.length == 0 ||
                        Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
                        keys.getFirst()[i] : startRow;
                byte[] splitStop = (stopRow.length == 0 ||
                        Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
                        keys.getSecond()[i].length > 0 ?
                        keys.getSecond()[i] : stopRow;
                InputSplit split = new TableSplit(table.getTableName(),
                        splitStart, splitStop, regionLocation);
                splits.add(split);
                if (log.isDebugEnabled()) {
                    log.debug("getSplits: split -> " + (count++) + " -> " + split);
                }
            }
        }
        return splits;
    }

    protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
        return true;
    }

    protected RecordScan getScan(LRepository repository) {
        RecordScan scan;
        String scanData = conf.get(SCAN);
        if (scanData != null) {
            try {
                JsonNode node = JsonFormat.deserializeNonStd(scanData);
                scan = RecordScanReader.INSTANCE.fromJson(node, repository);
            } catch (Exception e) {
                ExceptionUtil.handleInterrupt(e);
                throw new RuntimeException(e);
            }
        } else {
            scan = new RecordScan();
        }
        return scan;
    }
}
// Related classes of org.lilyproject.mapreduce.AbstractLilyScanInputFormat
//
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of
// Sun Microsystems, Inc., owned by Oracle Inc. Contact: coftware@gmail.com.