Examples of org.apache.hadoop.mapred.TextInputFormat

The snippets below, harvested from open-source projects, show the common usage patterns of the old-API (mapred) TextInputFormat: opening a record reader over a single split, computing splits, grouping splits with Tez, and wrapping the format inside Hive and combine-file input formats.
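
Before the harvested snippets, here is a minimal self-contained sketch of the full lifecycle (configure, split, read). The input path and class name are illustrative assumptions, not taken from any snippet below.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatDemo {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    FileInputFormat.setInputPaths(job, new Path("/tmp/demo-input")); // assumed path

    TextInputFormat format = new TextInputFormat();
    format.configure(job); // lets the format pick up compression codecs from the conf

    for (InputSplit split : format.getSplits(job, 1)) { // the count is only a hint
      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey(); // byte offset of the line
      Text value = reader.createValue();     // the line content
      while (reader.next(key, value)) {
        System.out.println(key + "\t" + value);
      }
      reader.close();
    }
  }
}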


    // Open a TextInputFormat record reader over a single FileSplit covering
    // the byte range [offset, offset + length) of the input file.
    this.split = new FileSplit(new Path(file), offset, length, defaultConf);

    this.hasMore = true;
    this.jobConf = defaultConf;
    this.input_format = new TextInputFormat();

    try {
      this.reader = input_format.getRecordReader(this.split, this.jobConf, voidReporter);
    } catch (IOException e) {
      e.printStackTrace();
    }


    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // ask TextInputFormat for splits (the requested count is only a hint)
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;
    InputSplit[] splits = null;
    try {
      splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
      e.printStackTrace();
    }

    final int numFiles = 10;

    createFiles(length, numFiles, random);

    // create a combined split for the files: wrap TextInputFormat in Tez's
    // grouping format, which coalesces the underlying splits into one
    TextInputFormat wrappedFormat = new TextInputFormat();
    wrappedFormat.configure(job);
    TezGroupedSplitsInputFormat<LongWritable, Text> format =
        new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setDesiredNumberOfSplits(1);
    format.setInputFormat(wrappedFormat);
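
A plausible next step, mirroring how Tez's own tests use the grouped format (assuming a JUnit test context; not part of the harvested snippet):

    // With setDesiredNumberOfSplits(1), everything should come back as a
    // single grouped split.
    InputSplit[] splits = format.getSplits(job, 1);
    assertEquals(1, splits.length);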

    // Group the splits of several gzipped files behind one grouped split;
    // each .gz file is unsplittable, so TextInputFormat yields one split per file.
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
        "is\ngzip\n");
    writeFile(localFs, new Path(workDir, "part3.txt.gz"), gzip,
        "one\nmore\nsplit\n");
    FileInputFormat.setInputPaths(job, workDir);
    TextInputFormat wrappedFormat = new TextInputFormat();
    wrappedFormat.configure(job);
    TezGroupedSplitsInputFormat<LongWritable, Text> format =
        new TezGroupedSplitsInputFormat<LongWritable, Text>();
    format.setConf(job);
    format.setInputFormat(wrappedFormat);

  // Hive's Base64TextInputFormat wraps a plain TextInputFormat; its record
  // reader then decodes each Base64-encoded line.
  TextInputFormat format;
  JobConf job;

  public Base64TextInputFormat() {
    format = new TextInputFormat();
  }
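
A hedged sketch of how such a wrapper typically continues; this delegation is the usual pattern, not the harvested code:

  // Split computation is forwarded untouched to the wrapped TextInputFormat;
  // only the record reader needs to add Base64 decoding.
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    return format.getSplits(job, numSplits);
  }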

  // Hive's SymlinkTextInputFormat: resolve the symlink split to its target,
  // then read the target data with a plain TextInputFormat.
  public RecordReader<LongWritable, Text> getRecordReader(
      InputSplit split, JobConf job, Reporter reporter) throws IOException {
    InputSplit targetSplit = ((SymlinkTextInputSplit) split).getTargetSplit();

    // The target data is in TextInputFormat.
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(job);
    RecordReader innerReader = null;
    try {
      innerReader = inputFormat.getRecordReader(targetSplit, job,
          reporter);
    } catch (Exception e) {
      innerReader = HiveIOExceptionHandlerUtil
          .handleRecordReaderCreationException(e, job);
    }
    // ... (remainder of the method elided in this excerpt)

    // Hive's SymlinkTextInputFormat.getSplits: expand each symlink target
    // path with TextInputFormat and wrap the resulting FileSplits so they
    // remember which symlink they came from.
    if (targetPaths.size() == 0) {
      return new InputSplit[0];
    }

    // The input should be in TextInputFormat.
    TextInputFormat inputFormat = new TextInputFormat();
    JobConf newjob = new JobConf(job);
    newjob.setInputFormat(TextInputFormat.class);
    inputFormat.configure(newjob);

    List<InputSplit> result = new ArrayList<InputSplit>();

    // ceil(numSplits / numPaths), so we can get at least numSplits splits.
    int numPaths = targetPaths.size();
    int numSubSplits = (numSplits + numPaths - 1) / numPaths;

    // For each path, do getSplits().
    for (int i = 0; i < numPaths; ++i) {
      Path targetPath = targetPaths.get(i);
      Path symlinkPath = symlinkPaths.get(i);

      FileInputFormat.setInputPaths(newjob, targetPath);

      InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
      for (InputSplit is : iss) {
        result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit)is));
      }
    }
    return result.toArray(new InputSplit[result.size()]);
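
For example, with numSplits = 10 and numPaths = 3, numSubSplits = (10 + 3 - 1) / 3 = 4, so each of the three paths is asked for 4 splits and the method returns at least 12 overall (getSplits treats the requested count as a hint, so the exact number can vary).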

  // A simpler variant of the getRecordReader shown earlier, without the
  // Hive I/O exception handler:
  public RecordReader<LongWritable, Text> getRecordReader(
      InputSplit split, JobConf job, Reporter reporter) throws IOException {
    InputSplit targetSplit = ((SymlinkTextInputSplit) split).getTargetSplit();

    // The target data is in TextInputFormat.
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(job);
    return inputFormat.getRecordReader(targetSplit, job, reporter);
  }

  // Adapter that lets CombineFileRecordReader open a TextInputFormat reader
  // for each chunk of a CombineFileSplit.
  private static class TextRecordReaderWrapper
      extends org.apache.hadoop.mapred.lib.CombineFileRecordReaderWrapper<LongWritable, Text> {
    // this constructor signature is required by CombineFileRecordReader
    public TextRecordReaderWrapper(CombineFileSplit split, Configuration conf,
        Reporter reporter, Integer idx) throws IOException {
      super(new TextInputFormat(), split, conf, reporter, idx);
    }
  }
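
A hedged sketch of how the wrapper is typically plugged in, mirroring Hadoop's CombineTextInputFormat (illustrative, not the harvested code):

  // CombineFileRecordReader walks the chunks of the combined split and
  // instantiates one TextRecordReaderWrapper per chunk via reflection.
  public RecordReader<LongWritable, Text> getRecordReader(InputSplit split,
      JobConf conf, Reporter reporter) throws IOException {
    return new CombineFileRecordReader(conf, (CombineFileSplit) split,
        reporter, TextRecordReaderWrapper.class);
  }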
