Package org.apache.pig.data

Examples of org.apache.pig.data.InternalMap


        if(in==null || in.size()==0)
            return null;
        Integer numQuantiles = null;
        DataBag samples = null;
        ArrayList<Tuple> quantilesList = new ArrayList<Tuple>();
        InternalMap weightedParts = new InternalMap();
        // the sample file has a tuple as under:
        // (numQuantiles, bag of samples)
        // numQuantiles here is the reduce parallelism
        try{
            numQuantiles = (Integer)in.get(0);
            samples = (DataBag)in.get(1);
           
            long numSamples = samples.size();
            long toSkip = numSamples / numQuantiles;
            if(toSkip == 0) {
                // numSamples is < numQuantiles;
                // set numQuantiles to numSamples
                numQuantiles = (int)numSamples;
                toSkip = 1;
            }
           
            long ind=0, j=-1, nextQuantile = toSkip-1;
            for (Tuple it : samples) {
                if (ind==nextQuantile){
                    ++j;
                    quantilesList.add(it);
                    nextQuantile+=toSkip;
                    if(j==numQuantiles-1)
                        break;
                }
                ind++;
                if (ind % 1000 == 0) progress();
            }
            long i=-1;
            Map<Tuple,CountingMap<Integer>> contribs = new HashMap<Tuple, CountingMap<Integer>>();
            for (Tuple it : samples){
                ++i;
                if (i % 1000 == 0) progress();
                int partInd = (int)(i/toSkip); // which partition
                if(partInd==numQuantiles) break;
                // the quantiles array has the element from the sample which is the
                // last element for a given partition. For example: if numQuantiles
                // is 5 and number of samples is 100, then toSkip = 20
                // quantiles[0] = sample[19] // the 20th element
                // quantiles[1] = sample[39] // the 40th element
                // and so on. For any element in the sample between 0 and 19, partInd
                // will be 0. We want to check if a sample element which is
                // present between 0 and 19 is also the 19th (quantiles[0] element).
                // This would mean that element might spread over the 0th and 1st
                // partition. We are looking for contributions to a partition
                // from such elements.
               
                // First We only check for sample elements in partitions other than the last one
                // < numQuantiles -1 (partInd is 0 indexed).
                if(partInd<numQuantiles-1 && areEqual(it,quantilesList.get(partInd))){
                    if(!contribs.containsKey(it)){
                        CountingMap<Integer> cm = new CountingMap<Integer>();
                        cm.put(partInd, 1);
                        contribs.put(it, cm);
                    }
                    else
                        contribs.get(it).put(partInd, 1);
                }
                else{
                    // we are either in the last partition (last quantile)
                    // OR the sample element we are currently processing is not
                    // the same as the element in the quantile array for this partition
                    // if we haven't seen this sample item earlier, this is not an
                    // element which crosses partitions - so ignore
                    if(!contribs.containsKey(it))
                        continue;
                    else
                        // we have seen this sample before (in a previous partInd),
                        // add to the contribution associated with this sample - if we had
                        // not seen this sample in a previous partInd, then we would have not
                        // had this in the contribs map! (because of the if above).This
                        // "key" (represented by the sample item) can either go to the
                        // previous partInd or this partInd in the final sort reduce stage.
                        // That is where the amount of contribution to each partInd will
                        // matter and influence the choice.
                        contribs.get(it).put(partInd, 1);
                }
            }
            int k = 0;
            for(Entry<Tuple, CountingMap<Integer>> ent : contribs.entrySet()){
                if (k % 1000 == 0) progress();
                Tuple key = ent.getKey(); // sample item which repeats
               
                // this map will have the contributions of the sample item to the different partitions
                CountingMap<Integer> value = ent.getValue();
               
                long total = value.getTotalCount();
                Tuple probVec =  mTupleFactory.newTuple(numQuantiles.intValue());
                // initialize all contribution fractions for different
                // partitions to 0.0
                for (int l = 0; l < numQuantiles; l++) {
                    probVec.set(l, new Float(0.0));
                }
                // for each partition that this sample item is present in,
                // compute the fraction of the total occurrences for that
                // partition - this will be the probability with which we
                // will pick this partition in the final sort reduce job
                // for this sample item
                for (Entry<Integer,Integer> valEnt : value.entrySet()) {
                    probVec.set(valEnt.getKey(), (float)valEnt.getValue()/total);
                }
                weightedParts.put(key, probVec);
            }
            output.put(QUANTILES_LIST, mBagFactory.newDefaultBag(quantilesList));
            output.put(WEIGHTED_PARTS, weightedParts);
            return output;
        }catch (Exception e){
View Full Code Here


     * @param map
     *            LazyMap
     * @return InternalMap
     */
    public static InternalMap parseLazyMapToPigMap(LazyMap map) {
  InternalMap pigmap = new InternalMap();

  Map<Object, Object> javamap = map.getMap();

  if (javamap != null) {

      // for each item in the map extract the java primitive type
      for (Entry<Object, Object> entry : javamap.entrySet()) {
    pigmap.put(extractPigTypeFromHiveType(entry.getKey()),
      extractPigTypeFromHiveType(entry.getValue()));
      }

  }

View Full Code Here

                // the Quantiles file has a tuple as under:
                // (numQuantiles, bag of samples)
                // numQuantiles here is the reduce parallelism
                Map<String, Object> quantileMap = (Map<String, Object>) t.get(0);
                quantilesList = (DataBag) quantileMap.get(FindQuantiles.QUANTILES_LIST);
                InternalMap weightedPartsData = (InternalMap) quantileMap.get(FindQuantiles.WEIGHTED_PARTS);
                convertToArray(quantilesList);
                for(Entry<Object, Object> ent : weightedPartsData.entrySet()){
                    Tuple key = (Tuple)ent.getKey(); // sample item which repeats
                    float[] probVec = getProbVec((Tuple)ent.getValue());
                    weightedParts.put(getPigNullableWritable(key),
                            new DiscreteProbabilitySampleGenerator(probVec));
                }
View Full Code Here

     * @param map
     *            LazyMap
     * @return InternalMap
     */
    public static InternalMap parseLazyMapToPigMap(LazyMap map) {
        InternalMap pigmap = new InternalMap();

        Map<Object, Object> javamap = map.getMap();

        if (javamap != null) {

            // for each item in the map extract the java primitive type
            for (Entry<Object, Object> entry : javamap.entrySet()) {
                pigmap.put(extractPigTypeFromHiveType(entry.getKey()),
                        extractPigTypeFromHiveType(entry.getValue()));
            }

        }

View Full Code Here

                // the Quantiles file has a tuple as under:
                // (numQuantiles, bag of samples)
                // numQuantiles here is the reduce parallelism
                Map<String, Object> quantileMap = (Map<String, Object>) t.get(0);
                quantilesList = (DataBag) quantileMap.get(FindQuantiles.QUANTILES_LIST);
                InternalMap weightedPartsData = (InternalMap) quantileMap.get(FindQuantiles.WEIGHTED_PARTS);
                convertToArray(quantilesList);
                for(Entry<Object, Object> ent : weightedPartsData.entrySet()){
                    Tuple key = (Tuple)ent.getKey(); // sample item which repeats
                    float[] probVec = getProbVec((Tuple)ent.getValue());
                    weightedParts.put(getPigNullableWritable(key),
                            new DiscreteProbabilitySampleGenerator(probVec));
                }
View Full Code Here

        if(in==null || in.size()==0)
            return null;
        Integer numQuantiles = null;
        DataBag samples = null;
        ArrayList<Tuple> quantilesList = new ArrayList<Tuple>();
        InternalMap weightedParts = new InternalMap();
        // the sample file has a tuple as under:
        // (numQuantiles, bag of samples)
        // numQuantiles here is the reduce parallelism
        try{
            numQuantiles = (Integer)in.get(0);
            samples = (DataBag)in.get(1);
           
            long numSamples = samples.size();
            long toSkip = numSamples / numQuantiles;
            if(toSkip == 0) {
                // numSamples is < numQuantiles;
                // set numQuantiles to numSamples
                numQuantiles = (int)numSamples;
                toSkip = 1;
            }
           
            long ind=0, j=-1, nextQuantile = toSkip-1;
            for (Tuple it : samples) {
                if (ind==nextQuantile){
                    ++j;
                    quantilesList.add(it);
                    nextQuantile+=toSkip;
                    if(j==numQuantiles-1)
                        break;
                }
                ind++;
                if (ind % 1000 == 0) progress();
            }
            long i=-1;
            Map<Tuple,CountingMap<Integer>> contribs = new HashMap<Tuple, CountingMap<Integer>>();
            for (Tuple it : samples){
                ++i;
                if (i % 1000 == 0) progress();
                int partInd = new Long(i/toSkip).intValue(); // which partition
                if(partInd==numQuantiles) break;
                // the quantiles array has the element from the sample which is the
                // last element for a given partition. For example: if numQuantiles
                // is 5 and number of samples is 100, then toSkip = 20
                // quantiles[0] = sample[19] // the 20th element
                // quantiles[1] = sample[39] // the 40th element
                // and so on. For any element in the sample between 0 and 19, partInd
                // will be 0. We want to check if a sample element which is
                // present between 0 and 19 is also the 19th (quantiles[0] element).
                // This would mean that element might spread over the 0th and 1st
                // partition. We are looking for contributions to a partition
                // from such elements.
               
                // First We only check for sample elements in partitions other than the last one
                // < numQuantiles -1 (partInd is 0 indexed).
                if(partInd<numQuantiles-1 && areEqual(it,quantilesList.get(partInd))){
                    if(!contribs.containsKey(it)){
                        CountingMap<Integer> cm = new CountingMap<Integer>();
                        cm.put(partInd, 1);
                        contribs.put(it, cm);
                    }
                    else
                        contribs.get(it).put(partInd, 1);
                }
                else{
                    // we are either in the last partition (last quantile)
                    // OR the sample element we are currently processing is not
                    // the same as the element in the quantile array for this partition
                    // if we haven't seen this sample item earlier, this is not an
                    // element which crosses partitions - so ignore
                    if(!contribs.containsKey(it))
                        continue;
                    else
                        // we have seen this sample before (in a previous partInd),
                        // add to the contribution associated with this sample - if we had
                        // not seen this sample in a previous partInd, then we would have not
                        // had this in the contribs map! (because of the if above).This
                        // "key" (represented by the sample item) can either go to the
                        // previous partInd or this partInd in the final sort reduce stage.
                        // That is where the amount of contribution to each partInd will
                        // matter and influence the choice.
                        contribs.get(it).put(partInd, 1);
                }
            }
            int k = 0;
            for(Entry<Tuple, CountingMap<Integer>> ent : contribs.entrySet()){
                if (k % 1000 == 0) progress();
                Tuple key = ent.getKey(); // sample item which repeats
               
                // this map will have the contributions of the sample item to the different partitions
                CountingMap<Integer> value = ent.getValue();
               
                long total = value.getTotalCount();
                Tuple probVec =  mTupleFactory.newTuple(numQuantiles.intValue());
                // initialize all contribution fractions for different
                // partitions to 0.0
                for (int l = 0; l < numQuantiles; l++) {
                    probVec.set(l, new Float(0.0));
                }
                // for each partition that this sample item is present in,
                // compute the fraction of the total occurrences for that
                // partition - this will be the probability with which we
                // will pick this partition in the final sort reduce job
                // for this sample item
                for (Entry<Integer,Integer> valEnt : value.entrySet()) {
                    probVec.set(valEnt.getKey(), (float)valEnt.getValue()/total);
                }
                weightedParts.put(key, probVec);
            }
            output.put(QUANTILES_LIST, mBagFactory.newDefaultBag(quantilesList));
            output.put(WEIGHTED_PARTS, weightedParts);
            return output;
        }catch (Exception e){
View Full Code Here

            // the Quantiles file has a tuple as under:
            // (numQuantiles, bag of samples)
            // numQuantiles here is the reduce parallelism
            Map<String, Object> quantileMap = (Map<String, Object>) t.get(0);
            quantilesList = (DataBag) quantileMap.get(FindQuantiles.QUANTILES_LIST);
            InternalMap weightedPartsData = (InternalMap) quantileMap.get(FindQuantiles.WEIGHTED_PARTS);
            convertToArray(quantilesList);
            for(Entry<Object, Object> ent : weightedPartsData.entrySet()){
                Tuple key = (Tuple)ent.getKey(); // sample item which repeats
                float[] probVec = getProbVec((Tuple)ent.getValue());
                weightedParts.put(getPigNullableWritable(key),
                        new DiscreteProbabilitySampleGenerator(probVec));
            }
View Full Code Here

                // the Quantiles file has a tuple as under:
                // (numQuantiles, bag of samples)
                // numQuantiles here is the reduce parallelism
                Map<String, Object> quantileMap = (Map<String, Object>) t.get(0);
                quantilesList = (DataBag) quantileMap.get(FindQuantiles.QUANTILES_LIST);
                InternalMap weightedPartsData = (InternalMap) quantileMap.get(FindQuantiles.WEIGHTED_PARTS);
                convertToArray(quantilesList);
                for(Entry<Object, Object> ent : weightedPartsData.entrySet()){
                    Tuple key = (Tuple)ent.getKey(); // sample item which repeats
                    float[] probVec = getProbVec((Tuple)ent.getValue());
                    weightedParts.put(getPigNullableWritable(key),
                            new DiscreteProbabilitySampleGenerator(probVec));
                }
View Full Code Here

                // the Quantiles file has a tuple as under:
                // (numQuantiles, bag of samples)
                // numQuantiles here is the reduce parallelism
                Map<String, Object> quantileMap = (Map<String, Object>) t.get(0);
                quantilesList = (DataBag) quantileMap.get(FindQuantiles.QUANTILES_LIST);
                InternalMap weightedPartsData = (InternalMap) quantileMap.get(FindQuantiles.WEIGHTED_PARTS);
                convertToArray(quantilesList);
                for(Entry<Object, Object> ent : weightedPartsData.entrySet()){
                    Tuple key = (Tuple)ent.getKey(); // sample item which repeats
                    float[] probVec = getProbVec((Tuple)ent.getValue());
                    weightedParts.put(getPigNullableWritable(key),
                            new DiscreteProbabilitySampleGenerator(probVec));
                }
View Full Code Here

        }

        long start = System.currentTimeMillis();
        try {
            DataBag quantilesList = (DataBag) quantileMap.get(FindQuantiles.QUANTILES_LIST);
            InternalMap weightedPartsData = (InternalMap) quantileMap.get(FindQuantiles.WEIGHTED_PARTS);
            estimatedNumPartitions = (Integer)quantileMap.get(PigProcessor.ESTIMATED_NUM_PARALLELISM);
            convertToArray(quantilesList);
            for (Entry<Object, Object> ent : weightedPartsData.entrySet()) {
                Tuple key = (Tuple) ent.getKey(); // sample item which repeats
                float[] probVec = getProbVec((Tuple) ent.getValue());
                weightedParts.put(getPigNullableWritable(key),
                        new DiscreteProbabilitySampleGenerator(probVec));
            }
View Full Code Here

TOP

Related Classes of org.apache.pig.data.InternalMap

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.