Examples of org.apache.hadoop.mapred.TextInputFormat

    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
              "is\ngzip\n");
    writeFile(localFs, new Path(workDir, "part3.txt.gz"), gzip,
        "one\nmore\nsplit\n");
    FileInputFormat.setInputPaths(job, workDir);
    TextInputFormat wrappedFormat = new TextInputFormat();
    wrappedFormat.configure(job);
    TezGroupedSplitsInputFormat<LongWritable, Text> format =
        new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setInputFormat(wrappedFormat);
   
View Full Code Here
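
The snippet above is truncated mid-test. For orientation, here is a minimal, self-contained sketch of the same old-API read pattern without the Tez grouping layer; the input path is a placeholder and the wrapper class is our own scaffolding, not part of the original test:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatDemo {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    FileInputFormat.setInputPaths(job, new Path("/tmp/work")); // placeholder dir
    TextInputFormat format = new TextInputFormat();
    format.configure(job); // picks up compression codecs, so .gz inputs decode transparently

    for (InputSplit split : format.getSplits(job, 1)) {
      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey(); // byte offset of each line
      Text value = reader.createValue();     // the line contents
      while (reader.next(key, value)) {
        System.out.println(key + "\t" + value);
      }
      reader.close();
    }
  }
}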

Examples of org.apache.hadoop.mapred.TextInputFormat

  }

  @Override
  protected void runJob(JobConf job) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : " + splits.length);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    firstOutput = new PartialOutputCollector(numTrees);
    Reporter reporter = Reporter.NULL;

    firstIds = new int[splits.length];
    sizes = new int[splits.length];
   
    // to compute firstIds, process the splits in file order
    int firstId = 0;
    long slowest = 0; // duration of slowest map
    for (InputSplit split : splits) {
      int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

      RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

      LongWritable key = reader.createKey();
      Text value = reader.createValue();

      Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(),
View Full Code Here
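
The loop is cut off before the bookkeeping it announces. A hedged sketch of how firstIds and sizes can be filled "in file order", assuming firstIds[hp] should hold the global index of a split's first record as the comment suggests:

    // Sketch only: count records per split in file order and accumulate a
    // running first-record id. Uses the same input/job/reporter as above.
    for (InputSplit split : splits) {
      int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition
      RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      int size = 0;
      while (reader.next(key, value)) {
        size++; // TextInputFormat yields one record per line
      }
      reader.close();
      firstIds[hp] = firstId;
      sizes[hp] = size;
      firstId += size;
    }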

Examples of org.apache.hadoop.mapred.TextInputFormat

   *
   */
  protected void secondStep(JobConf job, Path forestPath,
      PredictionCallback callback) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : " + splits.length);

    Builder.sortSplits(splits);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < splits.length; p++) {
      total += Step2Mapper.nbConcerned(splits.length, numTrees, p);
    }

    secondOutput = new PartialOutputCollector(total);
    Reporter reporter = Reporter.NULL;
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < splits.length; partition++) {
      InputSplit split = splits[partition];
      RecordReader<LongWritable, Text> reader = input.getRecordReader(split,
          job, reporter);

      LongWritable key = reader.createKey();
      Text value = reader.createValue();

View Full Code Here
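
Note the sizing trick in this snippet: summing Step2Mapper.nbConcerned over every partition tells the test exactly how many outputs the maps will emit, so secondOutput can be allocated once with the right capacity rather than grown as records arrive.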

Examples of org.apache.hadoop.mapred.TextInputFormat

    job.setNumMapTasks(numMaps);

    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Step0OutputCollector collector = new Step0OutputCollector(numMaps);
    Reporter reporter = Reporter.NULL;

    for (int p = 0; p < numMaps; p++) {
      InputSplit split = sorted[p];
      RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

      LongWritable key = reader.createKey();
      Text value = reader.createValue();

      Step0Mapper mapper = new Step0Mapper();
View Full Code Here

Examples of org.apache.hadoop.mapred.TextInputFormat

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);
    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Reporter reporter = Reporter.NULL;

    int[] keys = new int[numMaps];
    Step0Output[] values = new Step0Output[numMaps];
   
    int[] expectedIds = new int[numMaps];
   
    for (int p = 0; p < numMaps; p++) {
      InputSplit split = sorted[p];
      RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

      LongWritable key = reader.createKey();
      Text value = reader.createValue();

      Long firstKey = null;
View Full Code Here
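
The snippet stops right after firstKey is declared. A plausible continuation, sketched from the surrounding variables (the real test presumably compares expectedIds against Step0's reported first ids):

      // Sketch: capture the byte offset of the split's first line.
      if (reader.next(key, value)) {
        firstKey = key.get();
        expectedIds[p] = (int) key.get();
      }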

Examples of org.apache.hadoop.mapred.TextInputFormat

    final int numFiles = 10;

    createFiles(length, numFiles, random);

    // create a combined split for the files
    TextInputFormat wrappedFormat = new TextInputFormat();
    wrappedFormat.configure(job);
    TezGroupedSplitsInputFormat<LongWritable, Text> format =
        new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setDesiredNumberOfSplits(1);
    format.setInputFormat(wrappedFormat);
View Full Code Here
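
With setDesiredNumberOfSplits(1), the grouping layer should collapse all ten files into a single split. A hedged sketch of what the test can then do (the read-through check is our assumption, not quoted from the test):

    // Sketch: request the grouped splits and read everything through one reader.
    InputSplit[] groupedSplits = format.getSplits(job, 1);
    RecordReader<LongWritable, Text> reader =
        format.getRecordReader(groupedSplits[0], job, Reporter.NULL);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    int lines = 0;
    while (reader.next(key, value)) {
      lines++; // a single grouped split should surface every line from all numFiles files
    }
    reader.close();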

Examples of org.apache.hadoop.mapred.TextInputFormat

      }
      Collections.sort(columnIds);
    }
    targetRecordCount = context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BATCH_SIZE);
    numCols = columnIds.size();
    TextInputFormat inputFormat = new TextInputFormat();
    JobConf job = new JobConf();
    job.setInt("io.file.buffer.size", context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
    job.setInputFormat(inputFormat.getClass());
    try {
      reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
      key = reader.createKey();
      value = reader.createValue();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
View Full Code Here
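
For context, a hedged sketch of how such a reader typically drains the record reader it just opened, one batch at a time (the column-parsing step is elided since it is Drill-specific):

    // Sketch: read up to targetRecordCount lines into the current batch.
    int recordCount = 0;
    while (recordCount < targetRecordCount && reader.next(key, value)) {
      // split `value` on the delimiter and write the numCols projected
      // columns into the output vectors here (Drill-specific, elided)
      recordCount++;
    }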

Examples of org.apache.hadoop.mapred.TextInputFormat

    this.split = new FileSplit( new Path( file ), offset, length, defaultConf);

    this.hasMore = true;
    this.jobConf = defaultConf;
    //this.split = split;
    this.input_format = new TextInputFormat();

    try {
      this.reader = input_format.getRecordReader(this.split, this.jobConf, voidReporter);
    } catch (IOException e) {
      throw new RuntimeException(e); // fail fast rather than leave the auto-generated stub
View Full Code Here

Examples of org.apache.hadoop.mapred.TextInputFormat

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
      splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
      throw new RuntimeException(e); // fail fast instead of continuing with null splits
    }
View Full Code Here
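
The comment above promises "a variety of sizes" but the snippet stops after a single value. A hedged sketch of that sweep (the split counts are illustrative; assumes the enclosing method declares throws IOException):

    // Sketch: request several split counts and sanity-check total coverage.
    for (int n : new int[] {1, 2, 5, 10}) {
      InputSplit[] s = format.getSplits(job, n);
      long totalBytes = 0;
      for (InputSplit split : s) {
        totalBytes += split.getLength();
      }
      System.out.println(n + " requested -> " + s.length + " splits, " + totalBytes + " bytes");
    }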