/* Nutchwax
*
* $Id: Nutchwax.java 1896 2007-08-01 21:44:31Z jlee-archive $
*
* Created on Feb 14, 2006
*
* Copyright (C) 2006 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.access.nutch;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.indexer.DeleteDuplicates;
import org.apache.nutch.indexer.IndexMerger;
import org.archive.access.nutch.jobs.ImportArcs;
import org.archive.access.nutch.jobs.NutchwaxCrawlDb;
import org.archive.access.nutch.jobs.NutchwaxIndexer;
import org.archive.access.nutch.jobs.NutchwaxLinkDb;
import org.archive.access.nutch.jobs.NutchwaxPagerank;
import org.archive.util.ArchiveUtils;
import org.apache.nutch.global.Global;
/**
* Driver that runs all the indexing jobs, from import through merge of the final index.
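*
* <p>Illustrative invocations (paths and collection name are placeholders):
* <pre>
* hadoop jar nutchwax.jar import inputs outputs collectionName
* hadoop jar nutchwax.jar all inputs outputs collectionName
* </pre>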
*/
public class Nutchwax
{
public static final Log LOG =
LogFactory.getLog(Nutchwax.class.getName());
private static final String KEY_COLLECTION_PREFIX = "c=";
private static final String KEY_COLLECTION_SUFFIX = ",u=";
private static final Pattern COLLECTION =
Pattern.compile("^\\s*c=([^,]+),u=(.*)\\s*", Pattern.DOTALL);
private final static List<String> JOBS = Arrays.asList(
"import", "update", "invert", "pagerank", "index", "dedup", "merge",
"all", "class", "search", "multiple", "version");
// Lazily initialize these two variables so any complaint about hadoop
// not being present -- if it's not -- is delayed until command-line
// processing is done.
private FileSystem fs = null;
private JobConf conf = null;
/**
* Default constructor.
* @throws IOException
*/
public Nutchwax() throws IOException
{
super();
}
public synchronized JobConf getJobConf()
{
if (this.conf == null) {
this.conf = new JobConf(NutchwaxConfiguration.getConfiguration());
}
return this.conf;
}
public synchronized FileSystem getFS() throws IOException
{
if (this.fs == null) {
this.fs = FileSystem.get(getJobConf());
}
return this.fs;
}
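/**
* Layout of the NutchWAX output directory. The crawldb, linkdb, pagerank,
* segments, indexes, and final merged index all live under the single
* output path, plus a local tmp dir used by the final index merge.
*/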
public class OutputDirectories
{
private final Path output;
private final Path crawlDb;
private final Path linkDb;
private final Path pagerank;
private final Path segments;
private final Path indexes;
private final Path index;
private final Path tmpDir;
public OutputDirectories(final Path output) throws IOException
{
this.output = output;
this.crawlDb = new Path(output, "crawldb");
this.linkDb = new Path(output, "linkdb");
this.pagerank = new Path(output, "pagerank");
this.segments = new Path(output, "segments");
this.indexes = new Path(output, "indexes");
this.index = new Path(output, "index");
this.tmpDir = getJobConf().getLocalPath("mapred.temp.dir",
Generator.generateSegmentName());
}
public Path getCrawlDb()
{
return crawlDb;
}
public Path getIndexes()
{
return indexes;
}
public Path getLinkDb()
{
return linkDb;
}
public Path getPagerank()
{
return pagerank;
}
public Path getSegments()
{
return segments;
}
public Path getTmpDir()
{
return tmpDir;
}
public Path getIndex()
{
return index;
}
public Path getOutput()
{
return output;
}
}
/**
* Run the passed list of mapreduce indexing jobs. Jobs are always run in
* order: import, update, invert, pagerank, index, dedup, merge.
*
* @param input Directory of files listing ARCs to import.
* @param collectionName Collection name to add to each resource.
* @param od Output directories to write under.
* @throws Exception
*/
protected void doAll(final Path input, final String collectionName,
final OutputDirectories od)
throws Exception
{
doImport(input, collectionName, od);
doUpdate(od);
doInvert(od);
doPagerank(od);
doIndexing(od);
doDedup(od);
doMerge(od);
LOG.info("Nutchwax finished.");
}
protected void doImport(final Path input, String collectionName,
final OutputDirectories od)
throws IOException
{
Path segment = new Path(od.getSegments(),
Generator.generateSegmentName() +
((collectionName == null || collectionName.length() <= 0)?
"": "-" + collectionName));
new ImportArcs(getJobConf()).importArcs(input, segment, collectionName);
}
protected void doUpdate(final OutputDirectories od)
throws IOException
{
doUpdate(od, null);
}
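/**
* Update the crawldb with the passed segments or, when
* <code>segments</code> is null, with just the latest segment found
* under the output directory.
*/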
protected void doUpdate(final OutputDirectories od,
final String[] segments)
throws IOException
{
LOG.info("updating crawldb " + od.getCrawlDb());
// Need to make sure the db dir exists before progressing.
Path dbPath = new Path(od.getCrawlDb(), CrawlDb.CURRENT_NAME);
if (!getFS().exists(dbPath))
{
getFS().mkdirs(dbPath);
}
CrawlDb cdb = new NutchwaxCrawlDb(getJobConf());
if (segments != null)
{
List<Path> paths = new ArrayList<Path>(segments.length);
for (int i = 0; i < segments.length; i++)
{
Path p = new Path(segments[i]);
if (!getFS().exists(p))
{
throw new FileNotFoundException(p.toString());
}
paths.add(p);
}
cdb.update(od.getCrawlDb(), paths.toArray(new Path[paths.size()]),
true, true);
}
else
{
Path[] allSegments = getSegments(od);
// This just does the last segment created.
cdb.update(od.getCrawlDb(),
new Path[] {allSegments[allSegments.length - 1]}, true, true);
}
}
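/**
* @return All segments found under the output segments directory.
* @throws FileNotFoundException If no segments exist yet.
*/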
protected Path [] getSegments(final OutputDirectories od)
throws IOException
{
Path[] allSegments = getFS().listPaths(od.getSegments());
if (allSegments == null || allSegments.length <= 0)
{
throw new FileNotFoundException(od.getSegments().toString());
}
return allSegments;
}
protected void doInvert(final OutputDirectories od, final Path [] segments)
throws IOException
{
createLinkdb(od);
new NutchwaxLinkDb(getJobConf()).
invert(od.getLinkDb(), segments, true, true, false);
}
protected void doInvert(final OutputDirectories od)
throws IOException
{
LOG.info("inverting links in " + od.getSegments());
new NutchwaxLinkDb(getJobConf()).
invert(od.getLinkDb(), getSegments(od), true, true, false);
}
protected boolean createLinkdb(final OutputDirectories od)
throws IOException
{
boolean result = false;
// Make sure the linkdb exists. Otherwise the install step, where the
// temporary location gets moved to the permanent one, fails.
if (getFS().mkdirs(new Path(od.getLinkDb(),
NutchwaxLinkDb.CURRENT_NAME)))
{
LOG.info("Created " + od.getLinkDb());
result = true;
}
return result;
}
protected void doPagerank(final OutputDirectories od)
throws IOException
{
LOG.info("computing pagerank scores in " + od.getPagerank());
new NutchwaxPagerank(getJobConf()).process(getSegments(od), od.getPagerank());
}
protected void doIndexing(final OutputDirectories od)
throws IOException
{
doIndexing(od, getFS().listPaths(od.getSegments()));
}
protected void doIndexing(final OutputDirectories od,
final Path [] segments)
throws IOException
{
LOG.info(" indexing " + segments);
new NutchwaxIndexer(getJobConf()).index(od.getIndexes(), od.getPagerank(), od.getCrawlDb(), od.getLinkDb(), segments);
}
protected void doDedup(final OutputDirectories od) throws IOException
{
LOG.info("dedup " + od.getIndex());
new DeleteDuplicates(getJobConf()).dedup(new Path[] {od.getIndexes()});
}
protected void doMerge(final OutputDirectories od) throws IOException
{
LOG.info("index merge " + od.getOutput() + " using tmpDir=" +
od.getTmpDir());
new IndexMerger(getJobConf()).merge(getFS().listPaths(od.getIndexes()),
od.getIndex(), od.getTmpDir());
}
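/**
* @return A copy of <code>args</code> with the first <code>offset</code>
* arguments dropped.
*/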
static String [] rewriteArgs(final String [] args, final int offset)
{
final String [] newArgs = new String[args.length - offset];
System.arraycopy(args, offset, newArgs, 0, newArgs.length);
return newArgs;
}
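/**
* Load the named class and invoke its static <code>main</code> with the
* remaining command-line arguments. For example, 'class
* org.apache.nutch.crawl.CrawlDbReader' runs the nutch 'readdb' command.
*/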
static Object doClassMain(final String [] args)
{
// Rewrite args to drop the leading nutchwax 'class' command.
final String className = args[1];
String [] newArgs = rewriteArgs(args, 2);
// From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html
Class [] argTypes = new Class[1];
argTypes[0] = String[].class;
Object result = null;
try
{
Method mainMethod =
Class.forName(className).getDeclaredMethod("main", argTypes);
// main is static, so the receiver argument to invoke is ignored.
result = mainMethod.invoke(null, new Object [] {newArgs});
}
catch (Throwable t)
{
t.printStackTrace();
}
return result;
}
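/**
* Run a search by delegating to the main of {@link NutchwaxBean}, which
* queries the index under the property 'searcher.dir'; equivalent to
* running the 'class' job with NutchwaxBean as the class.
*/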
protected Object doSearch(final String [] args)
{
String [] newArgs = new String[args.length + 1];
newArgs[0] = args[0];
newArgs[1] = NutchwaxBean.class.getName();
for (int i = 1; i < args.length; i++)
{
newArgs[i + 1] = args[i];
}
return doClassMain(newArgs);
}
protected void doMultiple(final String [] args) throws Exception
{
(new Multiple()).run(rewriteArgs(args, 1));
}
protected void doVersion(final String [] args) throws Exception {
JobConf job = getJobConf();
String collectionType = job.get(Global.COLLECTION_TYPE);
System.out.println("Collection type:"+collectionType);
}
protected void doJob(final String jobName, final String [] args)
throws Exception
{
if (jobName.equals("import"))
{
// Usage: hadoop jar nutchwax.jar import input output name
if (args.length != 4)
{
ImportArcs.doImportUsage(
"ERROR: Wrong number of arguments passed.", 2);
}
final Path input = new Path(args[1]);
final Path output = new Path(args[2]);
final String collectionName = args[3];
checkArcsDir(input);
OutputDirectories od = new OutputDirectories(output);
doImport(input, collectionName, od);
}
else if (jobName.equals("update"))
{
// Usage: hadoop jar nutchwax.jar update output
if (args.length < 2)
{
doUpdateUsage("ERROR: Wrong number of arguments passed.", 2);
}
OutputDirectories od = new OutputDirectories(new Path(args[1]));
if (args.length == 2)
{
doUpdate(od);
}
else
{
for (int i = 2; i < args.length; i++)
{
doUpdate(od, new String [] {args[i]});
}
}
}
else if (jobName.equals("invert"))
{
// Usage: hadoop jar nutchwax.jar invert output
if (args.length < 2)
{
doInvertUsage("ERROR: Wrong number of arguments passed.", 2);
}
OutputDirectories od = new OutputDirectories(new Path(args[1]));
if (args.length == 2)
{
doInvert(od);
}
else
{
final int offset = 2;
Path [] segments = new Path[args.length - offset];
for (int i = offset; i < args.length; i++)
{
Path f = new Path(args[i]);
if (! getFS().exists(f))
{
throw new FileNotFoundException(f.toString());
}
segments[i - offset] = f;
}
doInvert(od, segments);
}
}
/* TODO MC */
else if (jobName.equals("pagerank"))
{
// Usage: hadoop jar nutchwax.jar pagerank output
if (args.length != 2)
{
doPagerankUsage("ERROR: Wrong number of arguments passed.", 2);
}
OutputDirectories od = new OutputDirectories(new Path(args[1]));
doPagerank(od);
}
/* TODO MC */
else if (jobName.equals("index"))
{
// Usage: hadoop jar nutchwax.jar index output
if (args.length < 2)
{
doIndexUsage("ERROR: Wrong number of arguments passed.", 2);
}
OutputDirectories od = new OutputDirectories(new Path(args[1]));
if (args.length == 2)
{
doIndexing(od);
}
else
{
final int offset = 2;
Path [] segments = new Path[args.length - offset];
for (int i = offset; i < args.length; i++)
{
Path f = new Path(args[i]);
if (! getFS().exists(f))
{
throw new FileNotFoundException(f.toString());
}
segments[i - offset] = f;
}
doIndexing(od, segments);
}
}
else if (jobName.equals("dedup"))
{
// Usage: hadoop jar nutchwax.jar dedup output
if (args.length != 2)
{
doDedupUsage("Wrong number of arguments passed.", 2);
}
doDedup(new OutputDirectories(new Path(args[1])));
}
else if (jobName.equals("merge"))
{
// Usage: hadoop jar nutchwax.jar merge output
if (args.length != 2)
{
doMergeUsage("ERROR: Wrong number of arguments passed.", 2);
}
doMerge(new OutputDirectories(new Path(args[1])));
}
else if (jobName.equals("all"))
{
// Usage: hadoop jar nutchwax.jar all input output name
if (args.length != 4)
{
doAllUsage("ERROR: Wrong number of arguments passed.", 2);
}
final Path input = new Path(args[1]);
final Path output = new Path(args[2]);
final String collectionName = args[3];
checkArcsDir(input);
OutputDirectories od = new OutputDirectories(output);
doAll(input, collectionName, od);
}
else if (jobName.equals("class"))
{
if (args.length < 2)
{
doClassUsage("ERROR: Wrong number of arguments passed.", 2);
}
doClassMain(args);
}
else if (jobName.equals("search"))
{
// A query argument is required; args[0] is the 'search' job name itself.
if (args.length < 2)
{
doSearchUsage("ERROR: Wrong number of arguments passed.", 2);
}
doSearch(args);
}
else if (jobName.equals("multiple"))
{
doMultiple(args);
}
else if (jobName.equals("version"))
{
doVersion(args);
}
else
{
usage("ERROR: No handler for job name " + jobName, 4);
System.exit(0);
}
}
/**
* Check the arcs dir exists and looks like it has files that list ARCs
* (rather than ARCs themselves).
*
* @param arcsDir Directory to examine.
* @throws IOException
*/
protected void checkArcsDir(final Path arcsDir)
throws IOException
{
if (! getFS().exists(arcsDir))
{
throw new IOException(arcsDir + " does not exist.");
}
if (! getFS().isDirectory(arcsDir))
{
throw new IOException(arcsDir + " is not a directory.");
}
final Path [] files = getFS().listPaths(arcsDir);
for (int i = 0; i < files.length; i++)
{
if (! getFS().isFile(files[i]))
{
throw new IOException(files[i] + " is not a file.");
}
if (files[i].getName().toLowerCase().endsWith(".arc.gz"))
{
throw new IOException(files[i] + " is an ARC file (ARCSDIR " +
"should contain text file listing ARCs rather than " +
"actual ARCs).");
}
}
}
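/**
* Prefix the passed key with the collection name, producing the NutchWAX
* key format 'c=&lt;collection&gt;,u=&lt;url&gt;'. For example, the url
* 'http://example.com/' in collection 'test' becomes
* 'c=test,u=http://example.com/'.
*/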
public static Text generateWaxKey(WritableComparable key,
final String collection)
{
return generateWaxKey(key.toString(), collection);
}
public static Text generateWaxKey(final String keyStr,
final String collection)
{
if (collection == null)
{
throw new NullPointerException("Collection is null for " + keyStr);
}
if (keyStr == null)
{
throw new NullPointerException("keyStr is null");
}
if (keyStr.startsWith(KEY_COLLECTION_PREFIX))
{
LOG.warn("Key already has collection prefix: " + keyStr
+ ". Skipping.");
return new Text(keyStr);
}
return new Text(KEY_COLLECTION_PREFIX + collection.trim() +
KEY_COLLECTION_SUFFIX + keyStr.trim());
}
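/**
* @return The collection component of a NutchWAX key, e.g. 'test' for the
* key 'c=test,u=http://example.com/'.
* @throws IOException If the key lacks the collection prefix.
*/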
public static String getCollectionFromWaxKey(final WritableComparable key)
throws IOException
{
Matcher m = COLLECTION.matcher(key.toString());
if (!m.matches())
{
throw new IOException("Key doesn't have collection " +
"prefix <" + key.toString() + ">");
}
return m.group(1);
}
public static String getUrlFromWaxKey(final WritableComparable key)
throws IOException
{
Matcher m = COLLECTION.matcher(key.toString());
if (!m.matches())
{
throw new IOException("Key doesn't have collection " +
"prefix: " + key);
}
return m.group(2);
}
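/**
* Parse an ARC-style date string -- typically the 14-digit timestamp,
* e.g. '20060214000000' -- into milliseconds since the epoch, clamping
* pre-1970 dates to zero.
*/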
public static long getDate(String d) throws IOException
{
long date = 0;
try
{
date = ArchiveUtils.getDate(d).getTime();
}
catch (final java.text.ParseException e)
{
throw new IOException("Failed parse of date: " + d + ": " +
e.getMessage());
}
// Date can be < 0 if pre-1970 (Seen in some old ARCs).
return date >= 0? date: 0;
}
public static void usage(final String message, final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar <job> [args]");
System.out.println("Launch NutchWAX job(s) on a hadoop platform.");
System.out.println("Type 'hadoop jar nutchwax.jar help <job>' for" +
" help on a specific job.");
System.out.println("Jobs (usually) must be run in the order " +
"listed below.");
System.out.println("Available jobs:");
System.out.println(" import Import ARCs.");
System.out.println(" update Update dbs with recent imports.");
System.out.println(" invert Invert links.");
System.out.println(" pagerank Compute pagerank."); // TODO MC
System.out.println(" index Index segments.");
System.out.println(" dedup Deduplicate by URL or content MD5.");
System.out.println(" merge Merge segment indices into one.");
System.out.println(" all Runs all above jobs in order.");
System.out.println(" class Run the passed class's main.");
System.out.println(" search Run a query against index under " +
"property 'searcher.dir'");
System.out.println(" multiple Run multiple concurrent tasks.");
System.out.println(" version Indicates the software version.");
System.exit(exitCode);
}
public static void doUpdateUsage(final String message,
final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar update <output> " +
"[<segments>...]");
System.out.println("Arguments:");
System.out.println(" output Directory to write crawldb under.");
System.out.println("Options:");
System.out.println(" segments List of segments to update crawldb " +
"with. If none supplied, updates");
System.out.println(" using latest segment found.");
System.exit(exitCode);
}
public static void doInvertUsage(final String message,
final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar invert <output> " +
"[<segments>...]");
System.out.println("Arguments:");
System.out.println(" output Directory to write linkdb under.");
System.out.println("Options:");
System.out.println(" segments List of segments to update linkdb " +
"with. If none supplied, all under");
System.out.println(" '<output>/segments/' " +
"are passed.");
System.exit(exitCode);
}
/* TODO MC */
public static void doPagerankUsage(final String message,
final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar pagerank <output> ");
System.out.println("Arguments:");
System.out.println(" output Directory to write pagerank under.");
System.exit(exitCode);
}
/* TODO MC */
public static void doIndexUsage(final String message,
final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar index <output> " +
"[<segments>...]");
System.out.println("Arguments:");
System.out.println(" output Directory to write indexes under.");
System.out.println("Options:");
System.out.println(" segments List of segments to index. " +
"If none supplied, all under");
System.out.println(" '<output>/segments/' " +
"are indexed.");
System.exit(exitCode);
}
public static void doDedupUsage(final String message,
final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar dedup <output>");
System.out.println("Arguments:");
System.out.println(" output Directory in which indices" +
" to dedup reside.");
System.exit(exitCode);
}
public static void doMergeUsage(final String message,
final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar merge <output>");
System.out.println("Arguments:");
System.out.println(" output Directory in which indices" +
" to merge reside.");
System.exit(exitCode);
}
public static void doMultipleUsage(final String message,
final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
Multiple.usage();
System.exit(exitCode);
}
public static void doSearchUsage(final String message,
final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar search <query>");
System.out.println("Arguments:");
System.out.println(" query Query string to run against index under " +
"property 'searcher.dir'");
System.exit(exitCode);
}
public static void doAllUsage(final String message, final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar import <input>" +
" <output> <collection>");
System.out.println("Arguments:");
System.out.println(" input Directory of files" +
" listing ARC URLs to import");
System.out.println(" output Directory to import to. Inport is " +
"written to a subdir named");
System.out.println(" for current date plus collection " +
"under '<output>/segments/'");
System.out.println(" collection Collection name. Added to" +
" each resource.");
System.exit(exitCode);
}
public static void doClassUsage(final String message, final int exitCode)
{
if (message != null && message.length() > 0)
{
System.out.println(message);
}
System.out.println("Usage: hadoop jar nutchwax.jar class CLASS ...");
System.out.println("Arguments:");
System.out.println(" CLASS Name of class to run. Invokes main " +
"passing command-line arguments.");
System.out.println(" For example, use to run nutch " +
"commands. Below is list of command");
System.out.println(" name and implementing class. " +
"Pass name of class only and emits usage.");
System.out.println();
System.out.println(" readdb " +
"org.apache.nutch.crawl.CrawlDbReader");
System.out.println(" mergedb " +
"org.apache.nutch.crawl.CrawlDbMerger");
System.out.println(" readlinkdb " +
"org.apache.nutch.crawl.LinkDbReader");
System.out.println(" segread " +
"org.apache.nutch.segment.SegmentReader");
System.out.println(" mergesegs " +
"org.apache.nutch.segment.SegmentMerger");
System.out.println(" mergelinkdb " +
"org.apache.nutch.crawl.LinkDbMerger");
System.exit(exitCode);
}
static void doJobHelp(final String jobName)
{
if (! JOBS.contains(jobName))
{
usage("ERROR: Unknown job " + jobName, 1);
}
if (jobName.equals("import"))
{
ImportArcs.doImportUsage(null, 1);
}
else if (jobName.equals("update"))
{
doUpdateUsage(null, 1);
}
else if (jobName.equals("invert"))
{
doInvertUsage(null, 1);
}
/* TODO MC */
else if (jobName.equals("pagerank"))
{
doPagerankUsage(null, 1);
}
/* TODO MC */
else if (jobName.equals("index"))
{
doIndexUsage(null, 1);
}
else if (jobName.equals("dedup"))
{
doDedupUsage(null, 1);
}
else if (jobName.equals("merge"))
{
doMergeUsage(null, 1);
}
else if (jobName.equals("all"))
{
doAllUsage(null, 1);
}
else if (jobName.equals("search"))
{
doSearchUsage(null, 1);
}
else if (jobName.equals("multiple"))
{
doMultipleUsage(null, 1);
}
else if (jobName.equals("class"))
{
doClassUsage(null, 1);
}
else
{
usage("ERROR: No help for job name " + jobName, 4);
}
}
public static void main(String args[]) throws Exception
{
if (args.length < 1)
{
usage(null, 0);
return;
}
if (args[0].equalsIgnoreCase("help"))
{
if (args.length == 1)
{
usage("ERROR: Add command you need help on.", 0);
return;
}
doJobHelp(args[1].toLowerCase());
}
final String jobName = args[0].toLowerCase();
if (! JOBS.contains(jobName))
{
usage("ERROR: Unknown <job> " + jobName, 1);
}
Nutchwax ia = new Nutchwax();
ia.doJob(jobName, args);
}
}