if(in==null || in.size()==0)
return null;
Integer numQuantiles = null;
DataBag samples = null;
ArrayList<Tuple> quantilesList = new ArrayList<Tuple>();
InternalMap weightedParts = new InternalMap();
// the sample file contains a single tuple of the form:
// (numQuantiles, bag of samples)
// where numQuantiles is the degree of reduce parallelism
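// e.g. with 5 reducers and 100 collected samples (illustrative):
// in = (5, {(s0), (s1), ..., (s99)})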
try{
numQuantiles = (Integer)in.get(0);
samples = (DataBag)in.get(1);
long numSamples = samples.size();
long toSkip = numSamples / numQuantiles;
if(toSkip == 0) {
// there are fewer samples than quantiles;
// clamp numQuantiles down to numSamples
numQuantiles = (int)numSamples;
toSkip = 1;
}
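// First pass: keep every toSkip-th sample as a quantile boundary.
// e.g. with 100 samples and numQuantiles = 5, this selects the
// samples at indices 19, 39, 59, 79 and 99.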
long ind=0, j=-1, nextQuantile = toSkip-1;
for (Tuple it : samples) {
if (ind==nextQuantile){
++j;
quantilesList.add(it);
nextQuantile+=toSkip;
if(j==numQuantiles-1)
break;
}
ind++;
if (ind % 1000 == 0) progress();
}
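// Second pass: for each sample element that equals a quantile
// boundary, count how many of its copies fall into each partition.
// These counts decide how duplicate keys that straddle a boundary
// are split across partitions in the final sort reduce job.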
long i=-1;
Map<Tuple,CountingMap<Integer>> contribs = new HashMap<Tuple, CountingMap<Integer>>();
for (Tuple it : samples){
++i;
if (i % 1000 == 0) progress();
int partInd = (int)(i / toSkip); // which partition this sample falls in
if(partInd==numQuantiles) break;
// quantilesList holds, for each partition, the sample element that
// is the last element of that partition. For example: if numQuantiles
// is 5 and the number of samples is 100, then toSkip = 20 and
// quantiles[0] = sample[19] // the 20th element
// quantiles[1] = sample[39] // the 40th element
// and so on. For any sample element with index between 0 and 19,
// partInd will be 0. We want to check whether a sample element that
// lies between indices 0 and 19 is also the 19th element (quantiles[0]).
// Such an element might spread over both the 0th and the 1st
// partition, and we are looking for the contributions it makes to
// each partition.
// First, we only check sample elements in partitions other than the
// last one, i.e. partInd < numQuantiles - 1 (partInd is 0-indexed).
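// Worked example (illustrative): if samples 17-20 all hold the same
// key, indices 17-19 match quantiles[0] and land in partInd 0, while
// index 20 lands in partInd 1, so contribs for that key ends up as
// {0: 3, 1: 1}.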
if(partInd<numQuantiles-1 && areEqual(it,quantilesList.get(partInd))){
if(!contribs.containsKey(it)){
CountingMap<Integer> cm = new CountingMap<Integer>();
cm.put(partInd, 1);
contribs.put(it, cm);
}
else
contribs.get(it).put(partInd, 1);
}
else{
// we are either in the last partition (last quantile), or the
// sample element we are currently processing is not the same as
// the boundary element in quantilesList for this partition.
// If we haven't seen this sample item before, it is not an
// element that crosses partitions - so ignore it.
if(!contribs.containsKey(it))
continue;
else
// we have seen this sample before (in a previous partInd), so
// add to the contribution associated with it - had we not seen
// it in a previous partInd, it would not be in the contribs map
// (because of the check above). This "key" (represented by the
// sample item) can go to either the previous partInd or this
// partInd in the final sort reduce stage; the amount of
// contribution to each partInd is what will influence that choice.
contribs.get(it).put(partInd, 1);
}
}
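// Final pass: convert the raw per-partition counts of each straddling
// key into a probability vector over all partitions.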
int k = 0;
for(Entry<Tuple, CountingMap<Integer>> ent : contribs.entrySet()){
if (k++ % 1000 == 0) progress();
Tuple key = ent.getKey(); // a sample item which repeats across partitions
// this map will have the contributions of the sample item to the different partitions
CountingMap<Integer> value = ent.getValue();
long total = value.getTotalCount();
Tuple probVec = mTupleFactory.newTuple(numQuantiles.intValue());
// initialize all contribution fractions for different
// partitions to 0.0
for (int l = 0; l < numQuantiles; l++) {
probVec.set(l, 0.0f);
}
// for each partition that this sample item is present in,
// compute the fraction of the total occurrences for that
// partition - this will be the probability with which we
// will pick this partition in the final sort reduce job
// for this sample item
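// Continuing the worked example: contribs {0: 3, 1: 1} has a total
// of 4, so probVec becomes (0.75, 0.25, 0.0, 0.0, 0.0).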
for (Entry<Integer,Integer> valEnt : value.entrySet()) {
probVec.set(valEnt.getKey(), (float)valEnt.getValue()/total);
}
weightedParts.put(key, probVec);
}
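// Package the results: the quantile boundaries and, for each
// straddling key, its partition-probability vector.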
output.put(QUANTILES_LIST, mBagFactory.newDefaultBag(quantilesList));
output.put(WEIGHTED_PARTS, weightedParts);
return output;
}catch (Exception e){