Source Code of org.archive.access.nutch.Multiple$LineInputSplit

package org.archive.access.nutch;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.util.NutchConfiguration;

/**
* Run multiple concurrent non-mapreduce {@link ToolBase} tasks such as
* {@link org.apache.nutch.indexer.IndexMerger} or
* {@link org.apache.nutch.indexer.IndexSorter}.
*
 * Each input line names the class to run followed by the arguments to
 * pass. Here is an example line for IndexMerger:
 * <code>org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-new
 * indexes</code>. Here is one for IndexSorter:
 * <code>org.apache.nutch.indexer.IndexSorter /home/stack/tmp/crawl</code>
 * (Note that IndexSorter reads from the local filesystem; the indexes to
 * sort must be on local disk). We run as many tasks as there are input lines.
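 *
 * A complete input file might look like the following (the paths are
 * illustrative; empty lines and lines starting with '#' are skipped):
 * <pre>
 * # Monday maintenance tasks
 * org.apache.nutch.indexer.IndexMerger -workingdir /3/hadoop-tmp index-monday indexes-monday
 * org.apache.nutch.indexer.IndexSorter /home/stack/tmp/crawl
 * </pre>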
*
* @author stack
*/
public class Multiple extends ToolBase implements Mapper
{
  public final Log LOG = LogFactory.getLog(this.getClass());
  private JobConf job;
   
  public void map(WritableComparable key, Writable value,
    OutputCollector output, final Reporter reporter)
    throws IOException
  {
    // Split on runs of whitespace so multiple spaces between arguments
    // do not produce empty tokens.
    final String [] words = value.toString().trim().split("\\s+");
   
    if (words.length == 0 || words[0].length() == 0)
    {
      return;
    }
   
    final String className = words[0];
   
    // Start a timer that updates the reporter every ten seconds so this
    // long-running, otherwise-silent task is not timed out.
    Timer t = new Timer(false);
   
    t.scheduleAtFixedRate(new TimerTask()
    {
      @Override
      public void run() {  
          reporter.setStatus("Running " + className);   
      }
    }, 0, 10000);
   
    try
    {
      int result = doMain(words);
     
      reporter.setStatus("Done running " + className + ": " + result);
     
      if (result != 0)
      {
        throw new IOException(className + " returned non-zero: " +
          result + ", check logs.");
      }
    }
    finally
    {
      t.cancel();
    }
  }

  /**
   * Call {@link ToolBase#doMain(org.apache.hadoop.conf.Configuration, String[])}
   * on the passed classname.
   * @param args Class name to run followed by its arguments.
   * @return Result from the call to doMain.
   */
  private int doMain(final String [] args)
  {
    final String className = args[0];
   
    // Rewrite args so they no longer include the leading class name.
    String [] newArgs = Nutchwax.rewriteArgs(args, 1);
    int result = -1;
   
    try
    {
      Object obj = Class.forName(className).newInstance();
      result = ((ToolBase)obj).doMain(this.job, newArgs);
    }
    catch (Exception e)
    {
      LOG.error(className, e);
    }
   
    return result;
  }

  public void configure(final JobConf j)
  {
    this.job = j;
  }

  public void close() throws IOException
  {
    // Nothing to close.
  }

  public static class MultipleInputFormat implements InputFormat
  {
    public RecordReader getRecordReader(final InputSplit split,
      final JobConf job, final Reporter reporter)
      throws IOException
    {
      // Only one record/line to read.
      return new RecordReader()
      {
        private final String line = ((LineInputSplit)split).line;
        private boolean read = false;
       
        public void close() throws IOException
        {
          // Nothing to close.
        }

        public WritableComparable createKey()
        {
          return new Text("");
        }

        public Writable createValue() {
          return new Text("");
        }

        public long getPos() throws IOException
        {
          return 0;
        }

        public float getProgress() throws IOException
        {
          // The single record is either unread (0.0) or consumed (1.0).
          return this.read ? 1.0f : 0.0f;
        }

        public boolean next(Writable key, Writable value)
          throws IOException
        {
          if (read)
          {
            return false;
          }
         
          read = true;
         
          ((Text)value).set(this.line);

          return true;
        }
      };
    }

    public InputSplit[] getSplits(JobConf job, int numSplits)
      throws IOException
    {
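      // Create one split, and hence one map task, per non-comment line
      // found in the files under the job's input paths.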
      Path[] inputs = job.getInputPaths();

      List<String> lines = new ArrayList<String>();

      for (int i = 0; i < inputs.length; i++)
      {
        Path p = inputs[i];
        FileSystem fs = p.getFileSystem(job);
        Path [] ps = fs.listPaths(p);

        for (int j = 0; j < ps.length; j++)
        {
          if (fs.isDirectory(ps[j]))
          {
            continue;
          }
         
          addFileLines(lines, fs, ps[j]);
        }
      }
     
      List<LineInputSplit> splits =
        new ArrayList<LineInputSplit>(lines.size());
       
      for (String line: lines)
      {
        splits.add(new LineInputSplit(line));
      }
     
      job.setNumMapTasks(lines.size());
     
      return splits.toArray(new LineInputSplit [splits.size()]);
    }
   
    private void addFileLines(final List<String> lines, final FileSystem fs,
        final Path p)
      throws IOException
    {
      InputStream is = fs.open(p);
      LineNumberReader lnr = null;
     
      try
      {
        lnr = new LineNumberReader(new InputStreamReader(is));
       
        for (String l = null; (l = lnr.readLine()) != null;)
        {
          if (l.length() > 0 && !l.trim().startsWith("#"))
          {
            lines.add(l);
          }
        }
      }
      finally
      {
        if (lnr != null)
        {
          lnr.close();
        }
       
        is.close();
      }
    }

    public void validateInput(JobConf job) throws IOException
    {
      // Nothing to validate.
    }
  }
 
  public static class LineInputSplit implements InputSplit
  {
    private String line;
   
    protected LineInputSplit()
    {
      super();
    }
   
    public LineInputSplit(final String l)
    {
      line = l;
    }
   
    public long getLength() throws IOException
    {
      return line.length();
    }

    public String[] getLocations() throws IOException
    {
      return new String[0];
    }

    public void readFields(DataInput in) throws IOException
    {
      // Use Text's length-prefixed string serialization so the split
      // round-trips; the original writeBytes/readLine pairing wrote no
      // length or terminator and could not be reliably read back.
      this.line = Text.readString(in);
    }

    public void write(DataOutput out) throws IOException
    {
      Text.writeString(out, this.line);
    }
  }
 
  public static void usage()
  {
    System.out.println("Usage: multiple <input> <output>");
    System.out.println("Runs concurrently all commands listed in " +
      "<inputs>.");
    System.out.println("Arguments:");
    System.out.println(" <input>   Directory of input files with " +
      "each line describing task to run");
    System.out.println(" <output>  Output directory.");
    System.out.println("Example input lines:");
    System.out.println();
    System.out.println(" An input line to specify a merge would look like:");
    System.out.println();
    System.out.println(" org.apache.nutch.indexer.IndexMerger " +
      "-workingdir /3/hadoop-tmp index-monday indexes-monday");
    System.out.println();
    System.out.println(" Note that named class must implement " +
      "org.apache.hadoop.util.ToolBase");
    System.out.println();
    System.out.println(" To copy from " +
      "hdfs://HOST:PORT/user/stack/index-monday to");
    System.out.println( " file:///0/searcher.dir/index:");
    System.out.println();
    System.out.println(" org.apache.hadoop.fs.FsShell " +
      "/user/stack/index-monday /0/searcher.dir/index");
    System.out.println();
    System.out.println(" org.apache.nutch.indexer.IndexSorter " +
      "/home/stack/tmp/crawl");
    System.out.println();
    System.out.println(" Note that IndexSorter refers to local " +
      "filesystem and not to hdfs and is RAM-bound. Set");
    System.out.println(" task child RAM with the mapred.child.java.opts " +
      "property in your hadoop-site.xml.");
  }
 
  public int run(String[] args) throws Exception
  {
    if (args.length != 2 ||
        args[0].equals("-h") || args[0].equals("--help"))
    {
      usage();
      return -1;
    }
   
    JobConf job = new JobConf(MultipleInputFormat.class);
    job.setInputFormat(MultipleInputFormat.class);
    job.setInputPath(new Path(args[0]));
    job.setMapperClass(Multiple.class);
    job.setOutputPath(new Path(args[1]));
   
    JobClient.runJob(job);
   
    return 0;
  }
 
  public static void main(String[] args) throws Exception
  {
    int res = new Multiple().doMain(NutchConfiguration.create(), args);
   
    System.exit(res);
  }
}
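
For reference, here is a minimal sketch of driving Multiple from another
class rather than through its main(); the input and output paths are
illustrative only and the RunMultiple wrapper class is hypothetical.

import org.apache.nutch.util.NutchConfiguration;
import org.archive.access.nutch.Multiple;

public class RunMultiple
{
  public static void main(String[] args) throws Exception
  {
    // <input> is a directory of task files, one command per line;
    // <output> is a scratch directory for the job. Both are examples.
    int res = new Multiple().doMain(NutchConfiguration.create(),
      new String[] {"/user/stack/multiple-input",
        "/user/stack/multiple-output"});

    System.exit(res);
  }
}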