Package org.apache.hadoop.mapred.lib

Examples of org.apache.hadoop.mapred.lib.CombineFileSplit


A recurring pattern in these examples: getRecordReader unwraps the shim split
back into a CombineFileSplit and delegates to CombineFileRecordReader, which
reads the files packed into the split one after another.

    public RecordReader getRecordReader(JobConf job, HadoopShims.InputSplitShim split,
        Reporter reporter,
        Class<RecordReader<K, V>> rrClass)
        throws IOException {
      // The shim split wraps a CombineFileSplit; unwrap it and hand it, along
      // with the per-file reader class, to CombineFileRecordReader.
      CombineFileSplit cfSplit = (CombineFileSplit) split;
      return new CombineFileRecordReader(job, cfSplit, reporter, rrClass);
    }
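
To see where such a method lives, here is a minimal, self-contained sketch of a
CombineFileInputFormat subclass using the old org.apache.hadoop.mapred API. The
class names CombinedTextInputFormat and SingleFileReader are illustrative, not
from the code above; the constructor signature of the per-file reader, however,
is the one CombineFileRecordReader looks up reflectively.

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileSplit;
    import org.apache.hadoop.mapred.InputSplit;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.LineRecordReader;
    import org.apache.hadoop.mapred.RecordReader;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.lib.CombineFileInputFormat;
    import org.apache.hadoop.mapred.lib.CombineFileRecordReader;
    import org.apache.hadoop.mapred.lib.CombineFileSplit;

    public class CombinedTextInputFormat extends CombineFileInputFormat<LongWritable, Text> {

      @Override
      @SuppressWarnings("unchecked")
      public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf job,
          Reporter reporter) throws IOException {
        // CombineFileRecordReader instantiates one SingleFileReader per file in the split.
        return new CombineFileRecordReader<LongWritable, Text>(job, (CombineFileSplit) split,
            reporter, (Class) SingleFileReader.class);
      }

      // Per-file reader. CombineFileRecordReader requires exactly this constructor
      // signature: (CombineFileSplit, Configuration, Reporter, Integer).
      public static class SingleFileReader implements RecordReader<LongWritable, Text> {
        private final LineRecordReader delegate;

        public SingleFileReader(CombineFileSplit split, Configuration conf,
            Reporter reporter, Integer index) throws IOException {
          // Carve the index-th file out of the combined split and read it line by line.
          delegate = new LineRecordReader(conf, new FileSplit(split.getPath(index),
              split.getOffset(index), split.getLength(index), split.getLocations()));
        }

        public boolean next(LongWritable key, Text value) throws IOException {
          return delegate.next(key, value);
        }
        public LongWritable createKey() { return delegate.createKey(); }
        public Text createValue() { return delegate.createValue(); }
        public long getPos() throws IOException { return delegate.getPos(); }
        public float getProgress() throws IOException { return delegate.getProgress(); }
        public void close() throws IOException { delegate.close(); }
      }
    }

A job would select it with job.setInputFormat(CombinedTextInputFormat.class);
subclasses can cap how many bytes each combined split may hold via the
protected setMaxSplitSize(long) hook.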

The more substantial excerpt below builds the splits themselves: it packs
roughly paths.length / numSplits files into each CombineFileSplit and
recommends up to three hosts per split, ranked by how many bytes of the
split's files each host stores. Per-host byte counts are totalled in a Trove
TObjectLongHashMap.

    // Over-estimate using ceil, to ensure that the last split is not /too/ big.
    final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits);

    int pathsUsed = 0;
    int splitnum = 0;
    CombineFileSplit mfs;
    // Build every split; the last one may hold fewer than numberOfFilesPerSplit files.
    while (pathsUsed < numPaths)
    {
      /* Calculate the split size for this task - usually numberOfFilesPerSplit,
       * but less than that for the last split. */
      final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths
          ? numPaths - pathsUsed
          : numberOfFilesPerSplit;
      // Per-file arrays of information for this split.
      Path[] splitPaths = new Path[splitSizeForThisSplit];
      long[] splitLengths = new long[splitSizeForThisSplit];
      // Starts stay zero: every file in the split is read from its beginning.
      long[] splitStarts = new long[splitSizeForThisSplit];
      // Total bytes held per host, across all files in this split.
      final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>();
      String[] splitLocations = null; // Final recommended locations for this split.
      for (int i = 0; i < splitSizeForThisSplit; i++)
      {
        // Add this file's per-host byte counts into the running totals.
        locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>() {
          public boolean execute(String host, long bytes)
          {
            allLocationsForSplit.adjustOrPutValue(host, bytes, bytes);
            return true;
          }
        });
        // Recompute the recommendation after each file; the last iteration wins.
        if (allLocationsForSplit.size() <= 3)
        {
          splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
        }
        else
        {
          // More than three candidate hosts: keep the three holding the most bytes.
          String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
          Arrays.sort(hosts, new Comparator<String>() {
            public int compare(String o1, String o2) {
              final long diff = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2);
              if (diff > 0)
              {
                return -1; // o1 holds more bytes, so it sorts first.
              }
              else if (diff < 0)
              {
                return 1;
              }
              return 0;
            }
          });
          splitLocations = new String[3];
          System.arraycopy(hosts, 0, splitLocations, 0, 3);
        }
      }

      // Copy the per-file information for this split.
      System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
      System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
      // Count the number of paths consumed.
      pathsUsed += splitSizeForThisSplit;

      // Make the actual split object.
      mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
      splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
      splitnum++;
    }
    if (!(pathsUsed == paths.length)) { // excerpt is truncated here in the source
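
Since the Trove-based host ranking is the subtle part, here is a minimal
standalone sketch of the same idea using plain java.util collections; the
method and variable names are illustrative, not from the excerpt. Unlike the
excerpt, it ranks once after totalling all files rather than after each file,
which produces the same final recommendation.

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class SplitLocations {

      /** bytesPerHostPerFile: for each file in the split, a map of host -> bytes stored there. */
      public static String[] topHosts(List<Map<String, Long>> bytesPerHostPerFile) {
        // Accumulate bytes per host across every file in the split.
        Map<String, Long> totals = new HashMap<String, Long>();
        for (Map<String, Long> perFile : bytesPerHostPerFile) {
          for (Map.Entry<String, Long> e : perFile.entrySet()) {
            totals.merge(e.getKey(), e.getValue(), Long::sum);
          }
        }
        // Rank hosts by descending byte count and keep at most three.
        List<String> hosts = new ArrayList<String>(totals.keySet());
        hosts.sort(Comparator.comparingLong((String h) -> totals.get(h)).reversed());
        int n = Math.min(3, hosts.size());
        return hosts.subList(0, n).toArray(new String[n]);
      }

      public static void main(String[] args) {
        List<Map<String, Long>> files = new ArrayList<>();
        files.add(Map.of("hostA", 64L, "hostB", 64L));
        files.add(Map.of("hostA", 128L, "hostC", 32L, "hostD", 16L));
        // hostA holds the most bytes (192), so it leads the recommendation.
        System.out.println(String.join(",", topHosts(files))); // hostA,hostB,hostC
      }
    }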
