/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.partitioners;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.pig.backend.hadoop.HDataType;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.builtin.BinStorage;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.io.BufferedPositionedInputStream;
import org.apache.pig.impl.io.FileLocalizer;
import org.apache.pig.impl.io.NullableBytesWritable;
import org.apache.pig.impl.io.NullableDoubleWritable;
import org.apache.pig.impl.io.NullableFloatWritable;
import org.apache.pig.impl.io.NullableIntWritable;
import org.apache.pig.impl.io.NullableLongWritable;
import org.apache.pig.impl.io.NullableText;
import org.apache.pig.impl.io.NullableTuple;
import org.apache.pig.impl.io.PigNullableWritable;
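/**
 * Partitions keys for Pig's order-by job using quantile boundaries computed
 * from a sample of the input. Keys that fall between boundaries are routed
 * with a binary search over the quantiles array; keys equal to a boundary
 * value that the sample saw in more than one partition are routed
 * probabilistically, in proportion to the sample's per-partition counts.
 */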
public class WeightedRangePartitioner implements Partitioner<PigNullableWritable, Writable> {
PigNullableWritable[] quantiles;
RawComparator<PigNullableWritable> comparator;
Integer numQuantiles;
DataBag samples;
public static Map<PigNullableWritable,DiscreteProbabilitySampleGenerator> weightedParts = new HashMap<PigNullableWritable, DiscreteProbabilitySampleGenerator>();
JobConf job;
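/**
 * Routes a key to a partition. Keys with an entry in weightedParts straddle
 * a partition boundary and are assigned probabilistically; all other keys
 * are placed by binary search over the quantile boundaries. For example
 * (illustrative values only): with quantiles [10, 20, 30], key 15 makes
 * binarySearch return -2, the insertion point is 1, and the key goes to
 * partition 1; key 20 matches quantiles[1] exactly and, absent a
 * weightedParts entry, goes to partition 2.
 */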
public int getPartition(PigNullableWritable key, Writable value,
int numPartitions){
if(!weightedParts.containsKey(key)){
int index = Arrays.binarySearch(quantiles, key, comparator);
if (index < 0)
// not found: binarySearch returned (-(insertion point) - 1)
index = -index-1;
else
// exact match on a boundary element: route past it
index = index + 1;
return Math.min(index, numPartitions - 1);
}
// key equals a boundary that spans partitions in the sample:
// pick among the candidate partitions probabilistically
DiscreteProbabilitySampleGenerator gen = weightedParts.get(key);
return gen.getNext();
}
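/**
 * Reads the quantiles file produced by the sampling job and builds the
 * quantiles array (for binary-search routing) and the weightedParts map
 * (for boundary keys that span partitions).
 */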
public void configure(JobConf job) {
this.job = job;
String quantilesFile = job.get("pig.quantilesFile", "");
comparator = job.getOutputKeyComparator();
if (quantilesFile.length() == 0)
throw new RuntimeException(this.getClass().getSimpleName() + " used but no quantiles found");
try{
InputStream is = FileLocalizer.openDFSFile(quantilesFile,ConfigurationUtil.toProperties(job));
BinStorage loader = new BinStorage();
ArrayList<PigNullableWritable> quantilesList = new ArrayList<PigNullableWritable>();
loader.bindTo(quantilesFile, new BufferedPositionedInputStream(is), 0, Long.MAX_VALUE);
Tuple t = loader.getNext();
if(t==null) throw new RuntimeException("Empty samples file");
// the quantiles file contains a single tuple of the form:
// (numQuantiles, bag of samples)
// where numQuantiles is the reduce parallelism
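// For example (illustrative values only), with 4 reducers and 100
// collected samples the tuple would look like:
// (4, {(3),(7),(7),(12),...})  -- a bag of 100 single-field sample tuples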
numQuantiles = (Integer) t.get(0);
samples = (DataBag) t.get(1);
long numSamples = samples.size();
long toSkip = numSamples / numQuantiles;
if(toSkip == 0) {
// numSamples < numQuantiles:
// set numQuantiles to numSamples
numQuantiles = (int)numSamples;
toSkip = 1;
}
long ind=0, j=-1, nextQuantile = toSkip-1;
for (Tuple it : samples) {
if (ind==nextQuantile){
++j;
quantilesList.add(getPigNullableWritable(it));
nextQuantile+=toSkip;
if(j==numQuantiles-1)
break;
}
ind++;
//TODO how do we report progress?
//if (ind % 1000 == 0) progress();
// Currently there is no way to report progress, since in configure()
// we cannot get a handle to the reporter (even
// PhysicalOperator.getReporter() does not work; it is set to null).
// Hopefully the work done here takes well under 10 minutes, since we
// are dealing with roughly 100 * num_mappers samples. When
// RandomSampleLoader becomes an operator or UDF instead of a loader,
// we can choose the number of samples intelligently (instead of the
// static 100), and then not being able to report progress may not be
// a big issue.
}
convertToArray(quantilesList);
long i=-1;
Map<PigNullableWritable,CountingMap<Integer>> contribs = new HashMap<PigNullableWritable, CountingMap<Integer>>();
for (Tuple it : samples){
++i;
PigNullableWritable sample = getPigNullableWritable(it);
int partInd = (int)(i/toSkip); // which partition
if(partInd==numQuantiles) break;
// the quantiles array holds the sample element which is the
// last element for a given partition. For example: if numQuantiles
// is 5 and the number of samples is 100, then toSkip = 20 and
// quantiles[0] = sample[19] // the 20th element
// quantiles[1] = sample[39] // the 40th element
// and so on. For any element in the sample between 0 and 19, partInd
// will be 0. We want to check whether a sample element which falls
// between 0 and 19 is also equal to quantiles[0] (the 19th element):
// such an element might spread over the 0th and 1st partitions, and
// we are looking for contributions to each partition from such
// elements.
// First we only check sample elements in partitions other than the
// last one, i.e. partInd < numQuantiles - 1 (partInd is 0-indexed).
if(partInd<numQuantiles-1 && areEqual(sample,quantiles[partInd])){
if(!contribs.containsKey(sample)){
CountingMap<Integer> cm = new CountingMap<Integer>();
cm.put(partInd, 1);
contribs.put(sample, cm);
}
else
contribs.get(sample).put(partInd, 1);
}
else{
// we are either in the last partition (last quantile),
// OR the sample element we are currently processing is not
// equal to the element in the quantiles array for this partition.
// If we haven't seen this sample item earlier, it is not an
// element which crosses partitions - so ignore it.
if(!contribs.containsKey(sample))
continue;
else
// we have seen this sample before (in a previous partInd) -
// had we not, it would not be in the contribs map, because
// of the if above. Add to the contribution associated with
// this sample: in the final sort reduce stage it can go to
// either the previous partInd or this one, and the relative
// contribution to each partInd will influence that choice.
contribs.get(sample).put(partInd, 1);
}
}
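// Illustrative example (values assumed for exposition): with toSkip = 20,
// if the value 42 occupies sample indices 18..21 then quantiles[0] = 42 and
// the loop above records two contributions to partition 0 (indices 18, 19)
// and two to partition 1 (indices 20, 21); the loop below then gives 42 a
// probVec of [0.5, 0.5, 0, ...], so at runtime it is routed to partition 0
// or 1 with equal probability.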
for(Entry<PigNullableWritable, CountingMap<Integer>> ent : contribs.entrySet()){
PigNullableWritable key = ent.getKey(); // sample item which repeats
// this map will have the contributions of the sample item to the different partitions
CountingMap<Integer> value = ent.getValue();
long total = value.getTotalCount();
float[] probVec = new float[numQuantiles];
// for each partition that this sample item is present in,
// compute the fraction of the total occurrences for that
// partition - this will be the probability with which we
// pick this partition in the final sort reduce job
// for this sample item
for (Entry<Integer,Integer> valEnt : value.entrySet()) {
probVec[valEnt.getKey()] = (float)valEnt.getValue()/total;
}
weightedParts.put(key, new DiscreteProbabilitySampleGenerator(probVec));
}
}catch (Exception e){
throw new RuntimeException(e);
}
}
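/**
 * Wraps a sample tuple in the PigNullableWritable type the sort uses as
 * its key: a NullableTuple when a user comparator is in play or there is
 * more than one sort column, otherwise the nullable writable matching the
 * configured reduce key type.
 */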
private PigNullableWritable getPigNullableWritable(Tuple t) {
try {
// user comparators work with tuples - so if a user comparator
// is being used OR there is more than one sort column, use
// NullableTuple
if ("true".equals(job.get("pig.usercomparator")) || t.size() > 1) {
return new NullableTuple(t);
} else {
Object o = t.get(0);
String kts = job.get("pig.reduce.key.type");
if (kts == null) {
throw new RuntimeException("Didn't get reduce key type "
+ "from config file.");
}
return HDataType.getWritableComparableTypes(o,
Byte.valueOf(kts));
}
} catch (Exception e) {
throw new RuntimeException(e);
}
}
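/**
 * Two keys are considered equal when the job's output key comparator
 * (the same comparator that orders the sort) compares them as equal.
 */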
private boolean areEqual(PigNullableWritable sample, PigNullableWritable writable) {
return comparator.compare(sample, writable)==0;
}
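/**
 * Copies the quantile list into an array whose concrete element type
 * matches the key type, for use by the binary search in getPartition.
 */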
private void convertToArray(ArrayList<PigNullableWritable> q) {
if ("true".equals(job.get("pig.usercomparator")) ||
q.get(0).getClass().equals(NullableTuple.class)) {
quantiles = q.toArray(new NullableTuple[0]);
} else if (q.get(0).getClass().equals(NullableBytesWritable.class)) {
quantiles = q.toArray(new NullableBytesWritable[0]);
} else if (q.get(0).getClass().equals(NullableDoubleWritable.class)) {
quantiles = q.toArray(new NullableDoubleWritable[0]);
} else if (q.get(0).getClass().equals(NullableFloatWritable.class)) {
quantiles = q.toArray(new NullableFloatWritable[0]);
} else if (q.get(0).getClass().equals(NullableIntWritable.class)) {
quantiles = q.toArray(new NullableIntWritable[0]);
} else if (q.get(0).getClass().equals(NullableLongWritable.class)) {
quantiles = q.toArray(new NullableLongWritable[0]);
} else if (q.get(0).getClass().equals(NullableText.class)) {
quantiles = q.toArray(new NullableText[0]);
} else {
throw new RuntimeException("Unexpected class in " + this.getClass().getSimpleName());
}
}
}