Package com.ngdata.hbaseindexer.mr

Source Code of com.ngdata.hbaseindexer.mr.HBaseIndexingOptions

/*
* Copyright 2013 NGDATA nv
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.ngdata.hbaseindexer.mr;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Files;
import com.ngdata.hbaseindexer.ConfKeys;
import com.ngdata.hbaseindexer.conf.DefaultIndexerComponentFactory;
import com.ngdata.hbaseindexer.conf.IndexerComponentFactory;
import com.ngdata.hbaseindexer.conf.IndexerComponentFactoryUtil;
import com.ngdata.hbaseindexer.conf.IndexerConf;
import com.ngdata.hbaseindexer.conf.IndexerConf.MappingType;
import com.ngdata.hbaseindexer.model.api.IndexerDefinition;
import com.ngdata.hbaseindexer.model.api.IndexerNotFoundException;
import com.ngdata.hbaseindexer.model.impl.IndexerModelImpl;
import com.ngdata.hbaseindexer.morphline.MorphlineResultToSolrMapper;
import com.ngdata.hbaseindexer.parse.ResultToSolrMapper;
import com.ngdata.hbaseindexer.util.zookeeper.StateWatchingZooKeeper;
import com.ngdata.sep.impl.HBaseShims;
import com.ngdata.sep.util.io.Closer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.JobClient;
import org.apache.solr.hadoop.ForkedMapReduceIndexerTool.OptionsBridge;
import org.apache.solr.hadoop.ForkedZooKeeperInspector;
import org.apache.solr.hadoop.MapReduceIndexerTool;
import org.apache.solr.hadoop.MorphlineClasspathUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Container for commandline options passed in for HBase Indexer, as well as a bridge to existing MapReduce index
* building functionality.
*/
class HBaseIndexingOptions extends OptionsBridge {

    private static final Logger LOG = LoggerFactory.getLogger(MapReduceIndexerTool.class);

    static final String DEFAULT_INDEXER_NAME = "_default_";

    private Configuration conf;
    @VisibleForTesting
    protected HBaseAdmin hBaseAdmin;
    private List<Scan> scans;
    private IndexingSpecification hbaseIndexingSpecification;
    // Flag that we have created our own output directory
    private boolean generatedOutputDir = false;

    public String hbaseIndexerZkHost;
    public String hbaseIndexerName = DEFAULT_INDEXER_NAME;
    public String hbaseIndexerComponentFactory;
    public File hbaseIndexerConfigFile;
    public String hbaseTableName;
    public String hbaseStartRow;
    public String hbaseEndRow;
    public String hbaseStartTimeString;
    public String hbaseEndTimeString;
    public String hbaseTimestampFormat;
    public boolean overwriteOutputDir;
    public boolean clearIndex;

    public HBaseIndexingOptions(Configuration conf) {
        Preconditions.checkNotNull(conf);
        this.conf = conf;
    }

    /**
     * Determine if this is intended to be a map-only job that writes directly to a live Solr server.
     *
     * @return true if writes are to be done directly into Solr
     */
    public boolean isDirectWrite() {
        return reducers == 0;
    }

    public List<Scan> getScans() {
        if (scans == null) {
            throw new IllegalStateException("Scan has not yet been evaluated");
        }
        return scans;
    }

    public IndexingSpecification getIndexingSpecification() {
        if (hbaseIndexingSpecification == null) {
            throw new IllegalStateException("Indexing specification has not yet been evaluated");
        }
        return hbaseIndexingSpecification;
    }

    /**
     * Evaluate the current state to calculate derived options settings. Validation of the state is also done here, so
     * IllegalStateException will be thrown if incompatible options have been set.
     *
     * @throws IllegalStateException if an illegal combination of options has been set, or needed options are missing
     */
    public void evaluate() {
        evaluateIndexingSpecification();
        evaluateOutputDir();
        evaluateGoLiveArgs();
        evaluateShards();
        evaluateNumReducers();
        evaluateScan();
    }

    /**
     * Check the existence of an output directory setting if needed, or setting one if applicable.
     */
    @VisibleForTesting
    void evaluateOutputDir() {

        if (isDirectWrite() || isDryRun) {
            if (outputDir != null) {
                throw new IllegalStateException(
                        "--output-dir must not be specified if --reducers is 0 or --dry-run is enabled");
            }
            if (zkHost == null && (hbaseIndexerName == null || hbaseIndexerZkHost == null)) {
                throw new IllegalStateException(
                        "--zk-host must be specified if --reducers is 0 or --dry-run is enabled");
            }
            return;
        }

        if (goLive) {
            if (outputDir == null) {
                outputDir = new Path(conf.get("hbase.search.mr.tmpdir", "/tmp"), "search-"
                        + UUID.randomUUID().toString());
                generatedOutputDir = true;
            }
        } else {
            if (outputDir == null) {
                throw new IllegalStateException("Must supply --output-dir unless --go-live is enabled");
            }
        }
    }

    /**
     * Check if the output directory being used is an auto-generated temporary directory.
     */
    public boolean isGeneratedOutputDir() {
        return generatedOutputDir;
    }

    @VisibleForTesting
    void evaluateScan() {
        this.scans = Lists.newArrayList();
        IndexerComponentFactory factory = IndexerComponentFactoryUtil.getComponentFactory(hbaseIndexingSpecification.getIndexerComponentFactory(), new ByteArrayInputStream(hbaseIndexingSpecification.getConfiguration()), hbaseIndexingSpecification.getIndexConnectionParams());
        IndexerConf indexerConf = factory.createIndexerConf();
        applyMorphLineParams(indexerConf);
        List<byte[]> tableNames = Lists.newArrayList();
        String tableNameSpec = indexerConf.getTable();
        if (indexerConf.tableNameIsRegex()) {
            HTableDescriptor[] tables;
            try {
                HBaseAdmin admin = getHbaseAdmin();
                tables = admin.listTables(tableNameSpec);
            } catch (IOException e) {
                throw new RuntimeException("Error occurred fetching hbase tables", e);
            }
            for (HTableDescriptor descriptor : tables) {
                tableNames.add(descriptor.getName());
            }
        } else {
            tableNames.add(Bytes.toBytesBinary(tableNameSpec));
        }
       
        for (byte[] tableName : tableNames) {
            Scan hbaseScan = new Scan();
            hbaseScan.setCacheBlocks(false);
            hbaseScan.setCaching(conf.getInt("hbase.client.scanner.caching", 200));

            if (hbaseStartRow != null) {
                hbaseScan.setStartRow(Bytes.toBytesBinary(hbaseStartRow));
                LOG.debug("Starting row scan at " + hbaseStartRow);
            }

            if (hbaseEndRow != null) {
                hbaseScan.setStopRow(Bytes.toBytesBinary(hbaseEndRow));
                LOG.debug("Stopping row scan at " + hbaseEndRow);
            }

            Long startTime = evaluateTimestamp(hbaseStartTimeString, hbaseTimestampFormat);
            Long endTime = evaluateTimestamp(hbaseEndTimeString, hbaseTimestampFormat);

            if (startTime != null || endTime != null) {
                long scanStartTime = 0L;
                long scanEndTime = Long.MAX_VALUE;
                if (startTime != null) {
                    scanStartTime = startTime;
                    LOG.debug("Setting scan start of time range to " + startTime);
                }
                if (endTime != null) {
                    scanEndTime = endTime;
                    LOG.debug("Setting scan end of time range to " + endTime);
                }
                try {
                    hbaseScan.setTimeRange(scanStartTime, scanEndTime);
                } catch (IOException e) {
                    // In reality an IOE will never be thrown here
                    throw new RuntimeException(e);
                }
            }
            // Only scan the column families and/or cells that the indexer requires
            // if we're running in row-indexing mode       
            if (indexerConf.getMappingType() == MappingType.ROW) {
                MorphlineClasspathUtil.setupJavaCompilerClasspath();

                ResultToSolrMapper resultToSolrMapper = factory.createMapper(
                        hbaseIndexingSpecification.getIndexerName()
                );
                Get get = resultToSolrMapper.getGet(HBaseShims.newGet().getRow());
                hbaseScan.setFamilyMap(get.getFamilyMap());
            }
            hbaseScan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, tableName);

            scans.add(hbaseScan);
        }
    }

    // Taken from org.apache.solr.hadoop.MapReduceIndexerTool
    @VisibleForTesting
    void evaluateGoLiveArgs() {
        if (goLive && zkHost == null && solrHomeDir == null) {
            throw new IllegalStateException("--go-live requires at least one of --zk-host or --solr-home-dir");
        }
        if (goLive && zkHost == null && shardUrls == null) {
            throw new IllegalStateException("--go-live requires that you also pass --shard-url or --zk-host");
        }

        if (zkHost != null && collection == null) {
            throw new IllegalStateException("--zk-host requires that you also pass --collection");
        }

        if (zkHost != null) {
            return;
            // verify structure of ZK directory later, to avoid checking run-time errors during parsing.
        } else if (shardUrls != null) {
            if (shardUrls.size() == 0) {
                throw new IllegalStateException("--shard-url requires at least one URL");
            }
        } else if (shards != null) {
            if (shards <= 0) {
                throw new IllegalStateException("--shards must be a positive number: " + shards);
            }
        } else {
            throw new IllegalStateException("You must specify one of the following (mutually exclusive) arguments: "
                    + "--zk-host or --shard-url or --shards");
        }

        if (shardUrls != null) {
            shards = shardUrls.size();
        }

    }

    private HBaseAdmin getHbaseAdmin() throws IOException {
        if (hBaseAdmin == null) {
            hBaseAdmin = new HBaseAdmin(conf);
        }
        return hBaseAdmin;
    }

    // Taken from org.apache.solr.hadoop.MapReduceIndexerTool
    private void evaluateShards() {
        if (zkHost != null) {
            assert collection != null;
            ForkedZooKeeperInspector zki = new ForkedZooKeeperInspector();
            try {
                shardUrls = zki.extractShardUrls(zkHost, collection);
            } catch (Exception e) {
                LOG.debug("Cannot extract SolrCloud shard URLs from ZooKeeper", e);
                throw new RuntimeException("Cannot extract SolrCloud shard URLs from ZooKeeper", e);
            }
            assert shardUrls != null;
            if (shardUrls.size() == 0) {
                throw new IllegalStateException("--zk-host requires ZooKeeper " + zkHost
                        + " to contain at least one SolrCore for collection: " + collection);
            }
            shards = shardUrls.size();
            LOG.debug("Using SolrCloud shard URLs: {}", shardUrls);
        }
    }

    // Taken from org.apache.solr.hadoop.MapReduceIndexerTool
    private void evaluateNumReducers() {

        if (isDirectWrite()) {
            return;
        }

        if (shards <= 0) {
            throw new IllegalStateException("Illegal number of shards: " + shards);
        }
        if (fanout <= 1) {
            throw new IllegalStateException("Illegal fanout: " + fanout);
        }

        int reduceTaskCount;
        try {
            // MR1
            reduceTaskCount = new JobClient(conf).getClusterStatus().getMaxReduceTasks();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        LOG.info("Cluster reports {} reduce slots", reduceTaskCount);

        if (reducers == -2) {
            reduceTaskCount = shards;
        } else if (reducers != -1) {
            reduceTaskCount = reducers;
        }
        reduceTaskCount = Math.max(reduceTaskCount, shards);

        if (reduceTaskCount != shards) {
            // Ensure fanout isn't misconfigured. fanout can't meaningfully be larger than what would be
            // required to merge all leaf shards in one single tree merge iteration into root shards
            fanout = Math.min(fanout, (int) ceilDivide(reduceTaskCount, shards));

            // Ensure invariant reducers == options.shards * (fanout ^ N) where N is an integer >= 1.
            // N is the number of mtree merge iterations.
            // This helps to evenly spread docs among root shards and simplifies the impl of the mtree merge algorithm.
            int s = shards;
            while (s < reduceTaskCount) {
                s = s * fanout;
            }
            reduceTaskCount = s;
            assert reduceTaskCount % fanout == 0;
        }
        this.reducers = reduceTaskCount;
    }

    // same as IntMath.divide(p, q, RoundingMode.CEILING)
    private long ceilDivide(long p, long q) {
        long result = p / q;
        if (p % q != 0) {
            result++;
        }
        return result;
    }

    @VisibleForTesting
    void evaluateIndexingSpecification() {
        String tableName = null;
        byte[] configuration = null;
        Map<String,String> indexConnectionParams = Maps.newHashMap();

        if (hbaseIndexerZkHost != null) {

            if (hbaseIndexerName == null) {
                throw new IllegalStateException("--hbase-indexer-name must be supplied if --hbase-indexer-zk is specified");
            }

            StateWatchingZooKeeper zk = null;
            try {
                zk = new StateWatchingZooKeeper(hbaseIndexerZkHost, 30000);
                IndexerModelImpl indexerModel = new IndexerModelImpl(zk, conf.get(ConfKeys.ZK_ROOT_NODE, "/ngdata/hbaseindexer"));
                IndexerDefinition indexerDefinition = indexerModel.getIndexer(hbaseIndexerName);
                hbaseIndexerComponentFactory = indexerDefinition.getIndexerComponentFactory();
                configuration = indexerDefinition.getConfiguration();
                if (indexerDefinition.getConnectionParams() != null) {
                    indexConnectionParams.putAll(indexerDefinition.getConnectionParams());
                }
                if (zkHost == null) {
                    zkHost = indexConnectionParams.get("solr.zk");
                }
                if (collection == null) {
                    collection = indexConnectionParams.get("solr.collection");
                }
                indexerModel.stop();
            } catch (IndexerNotFoundException infe) {
                throw new IllegalStateException("Indexer " + hbaseIndexerName + " doesn't exist");
            } catch (Exception e) {
                // We won't bother trying to do any recovery here if things don't work out,
                // so we just throw the wrapped exception up the stack
                throw new RuntimeException(e);
            } finally {
                Closer.close(zk);
            }
        } else {
            if (hbaseIndexerComponentFactory == null) {
                hbaseIndexerComponentFactory = DefaultIndexerComponentFactory.class.getName();
            }

            if (hbaseIndexerConfigFile == null) {
                throw new IllegalStateException(
                        "--hbase-indexer-file must be specified if --hbase-indexer-zk is not specified");
            }
            if (solrHomeDir == null) {
                if (zkHost == null) {
                    throw new IllegalStateException(
                            "--zk-host must be specified if --hbase-indexer-zk is not specified");
                }
                if (collection == null) {
                    throw new IllegalStateException(
                            "--collection must be specified if --hbase-indexer-zk is not specified");
                }
            }
        }


        if (this.hbaseIndexerConfigFile != null) {
            try {
                configuration = Files.toByteArray(hbaseIndexerConfigFile);
            } catch (IOException e) {
                throw new RuntimeException("Error loading " + hbaseIndexerConfigFile, e);
            }
        }

        if (solrHomeDir != null) {
            indexConnectionParams.put("solr.mode", "classic");
            indexConnectionParams.put("solr.home", solrHomeDir.getAbsolutePath());
        } else {
            if (zkHost != null) {
                indexConnectionParams.put("solr.zk", zkHost);
            }

            if (collection != null) {
                indexConnectionParams.put("solr.collection", collection);
            }
        }

        ByteArrayInputStream is = new ByteArrayInputStream(configuration);
        IndexerComponentFactory factory = IndexerComponentFactoryUtil.getComponentFactory(hbaseIndexerComponentFactory, is, indexConnectionParams);
        IndexerConf indexerConf = factory.createIndexerConf();
        applyMorphLineParams(indexerConf);

        if (hbaseTableName != null) {
            tableName = hbaseTableName;
        } else {
            tableName = indexerConf.getTable();
        }

        if (hbaseIndexerName == null) {
            hbaseIndexerName = DEFAULT_INDEXER_NAME;
        }

        this.hbaseIndexingSpecification = new IndexingSpecification(
                                            tableName,
                                            hbaseIndexerName, hbaseIndexerComponentFactory,
                                            configuration, indexConnectionParams);
    }

    private void applyMorphLineParams(IndexerConf indexerConf) {
        Map<String, String> params = indexerConf.getGlobalParams();
        if (morphlineFile != null) {
            params.put(
                    MorphlineResultToSolrMapper.MORPHLINE_FILE_PARAM, morphlineFile.getPath());
        }
        if (morphlineId != null) {
            params.put(
                    MorphlineResultToSolrMapper.MORPHLINE_ID_PARAM, morphlineId);
        }

        for (Map.Entry<String, String> entry : conf) {
            if (entry.getKey().startsWith(MorphlineResultToSolrMapper.MORPHLINE_VARIABLE_PARAM + ".")) {
                params.put(entry.getKey(), entry.getValue());
            }
            if (entry.getKey().startsWith(MorphlineResultToSolrMapper.MORPHLINE_FIELD_PARAM + ".")) {
                params.put(entry.getKey(), entry.getValue());
            }
        }

        indexerConf.setGlobalParams(params);
    }

    /**
     * Evaluate a timestamp string with an optional format. If the format is not present,
     * the timestamp string is assumed to be a Long.
     *
     * @return evaluated timestamp, or null if there is no timestamp information supplied
     */
    static Long evaluateTimestamp(String timestampString, String timestampFormat) {
        if (timestampString == null) {
            return null;
        }
        if (timestampFormat == null) {
            try {
                return Long.parseLong(timestampString);
            } catch (NumberFormatException e) {
                throw new IllegalStateException("Invalid timestamp value: " + timestampString);
            }
        } else {
            SimpleDateFormat dateTimeFormat = null;
            try {
                dateTimeFormat = new SimpleDateFormat(timestampFormat);
            } catch (IllegalArgumentException e) {
                throw new IllegalStateException("Invalid timestamp format: " + e.getMessage());
            }
            try {
                return dateTimeFormat.parse(timestampString).getTime();
            } catch (ParseException e) {
                throw new IllegalStateException("Can't parse timestamp string '"
                        + timestampString + "': " + e.getMessage());
            }
        }
    }
}
TOP

Related Classes of com.ngdata.hbaseindexer.mr.HBaseIndexingOptions

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.