Package org.apache.nutch.mapReduce

Source Code of org.apache.nutch.mapReduce.TextInputFormat

/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.mapReduce;

import java.io.IOException;
import java.io.File;

import org.apache.nutch.fs.NutchFileSystem;
import org.apache.nutch.fs.NFSDataInputStream;

import org.apache.nutch.io.Writable;
import org.apache.nutch.io.WritableComparable;
import org.apache.nutch.io.LongWritable;
import org.apache.nutch.io.UTF8;

/** An {@link InputFormat} for plain text files.  Files are broken into lines.
* Either linefeed or carriage-return are used to signal end of line.  Keys are
* the position in the file, and values are the line of text.. */
public class TextInputFormat extends InputFormatBase {

  public String getName() { return "text"; }

  public RecordReader getRecordReader(NutchFileSystem fs, FileSplit split,
                                      JobConf job) throws IOException {

    final long start = split.getStart();
    final long end = start + split.getLength();

    // open the file and seek to the start of the split
    final NFSDataInputStream in =
      new NFSDataInputStream(fs.open(split.getFile()));
    in.seek(start);
   
    if (start != 0) {
      while (in.getPos() < end) {    // scan to the next newline in the file
        char c = (char)in.read();
        if (c == '\r' || c == '\n') {
          break;
        }
      }
    }

    return new RecordReader() {
        /** Read a line. */
        public boolean next(Writable key, Writable value) throws IOException {
          long pos = in.getPos();
          if (pos >= end)
            return false;

          ((LongWritable)key).set(pos);           // key is position
          ((UTF8)value).set(readLine(in));        // value is line
          return true;
        }
       
        public long getPos() throws IOException { return in.getPos(); }

        public void close() throws IOException { in.close(); }

      };
  }

  private static String readLine(NFSDataInputStream in) throws IOException {
    StringBuffer buffer = new StringBuffer();
    while (true) {

      int b = in.read();
      if (b == -1)
        break;

      char c = (char)b;              // bug: this assumes eight-bit characters.
      if (c == '\r' || c == '\n')
        break;

      buffer.append(c);
    }
   
    return buffer.toString();
  }

}
TOP

Related Classes of org.apache.nutch.mapReduce.TextInputFormat

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.