Source Code of org.apache.nutch.tools.Benchmark$BenchmarkResults

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.tools;

import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.CrawlDbReader;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

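/**
 * Command-line benchmark that drives a complete Nutch crawl
 * (inject, generate, fetch, parse, updatedb, invertlinks) over
 * synthetic seed URLs and reports per-stage timings.
 */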
public class Benchmark extends Configured implements Tool {
  private static final Log LOG = LogFactory.getLog(Benchmark.class);

  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    int res = ToolRunner.run(conf, new Benchmark(), args);
    System.exit(res);
  }
 
  private static String getDate() {
    return new SimpleDateFormat("yyyyMMddHHmmss").format
      (new Date(System.currentTimeMillis()));
  }
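
  /**
   * Writes <code>count</code> synthetic seed URLs of the form
   * http://www.test-N.com/ into a "seeds" file under <code>seedsDir</code>.
   */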
  private void createSeeds(FileSystem fs, Path seedsDir, int count) throws Exception {
    OutputStream os = fs.create(new Path(seedsDir, "seeds"));
    for (int i = 0; i < count; i++) {
      String url = "http://www.test-" + i + ".com/\r\n";
      os.write(url.getBytes());
    }
    os.flush();
    os.close();
  }
 
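  /**
   * Holds the crawl parameters and the per-stage, per-run timings
   * collected during a benchmark run.
   */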
  public static final class BenchmarkResults {
    Map<String,Map<String,Long>> timings = new HashMap<String,Map<String,Long>>();
    List<String> runs = new ArrayList<String>();
    List<String> stages = new ArrayList<String>();
    int seeds, depth, threads;
    boolean delete;
    long topN;
    long elapsed;
    String plugins;
   
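    /** Records the elapsed time for one (stage, run) pair, registering the stage and run on first use. */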
    public void addTiming(String stage, String run, long timing) {
      if (!runs.contains(run)) {
        runs.add(run);
      }
      if (!stages.contains(stage)) {
        stages.add(stage);
      }
      Map<String,Long> t = timings.get(stage);
      if (t == null) {
        t = new HashMap<String,Long>();
        timings.put(stage, t);
      }
      t.put(run, timing);
    }
   
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("* Plugins:\t" + plugins + "\n");
      sb.append("* Seeds:\t" + seeds + "\n");
      sb.append("* Depth:\t" + depth + "\n");
      sb.append("* Threads:\t" + threads + "\n");
      sb.append("* TopN:\t" + topN + "\n");
      sb.append("* Delete:\t" + delete + "\n");
      sb.append("* TOTAL ELAPSED:\t" + elapsed + "\n");
      for (String stage : stages) {
        Map<String,Long> timing = timings.get(stage);
        if (timing == null) continue;
        sb.append("- stage: " + stage + "\n");
        for (String r : runs) {
          Long time = timing.get(r);
          if (time == null) {
            continue;
          }
          sb.append("\trun " + r + "\t" + time + "\n");
        }
      }
      return sb.toString();
    }
   
    public List<String> getStages() {
      return stages;
    }
    public List<String> getRuns() {
      return runs;
    }
  }
 
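  /** Parses the command-line flags, then delegates to {@link #benchmark}. */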
  public int run(String[] args) throws Exception {
    String plugins = "protocol-http|parse-tika|scoring-opic|urlfilter-regex|urlnormalizer-pass";
    int seeds = 1;
    int depth = 10;
    int threads = 10;
    boolean delete = true;
    long topN = Long.MAX_VALUE;
   
    if (args.length == 0) {
      System.err.println("Usage: Benchmark [-seeds NN] [-depth NN] [-threads NN] [-keep] [-maxPerHost NN] [-plugins <regex>]");
      System.err.println("\t-seeds NN\tcreate NN unique hosts in a seed list (default: 1)");
      System.err.println("\t-depth NN\tperform NN crawl cycles (default: 10)");
      System.err.println("\t-threads NN\tuse NN threads per Fetcher task (default: 10)");
      System.err.println("\t-keep\tkeep segment data (default: delete after updatedb)");
      System.err.println("\t-plugins <regex>\toverride 'plugin.includes'.");
      System.err.println("\tNOTE: if not specified, this is reset to: " + plugins);
      System.err.println("\tNOTE: if 'default' is specified then a value set in nutch-default/nutch-site is used.");
      System.err.println("\t-maxPerHost NN\tmax. # of URLs per host in a fetchlist");
      return -1;
    }
    int maxPerHost = Integer.MAX_VALUE;
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-seeds")) {
        seeds = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-threads")) {
        threads = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-depth")) {
        depth = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-keep")) {
        delete = false;
      } else if (args[i].equals("-plugins")) {
        plugins = args[++i];
      } else if (args[i].equalsIgnoreCase("-maxPerHost")) {
        maxPerHost = Integer.parseInt(args[++i]);
      } else {
        LOG.fatal("Invalid argument: '" + args[i] + "'");
        return -1;
      }
    }
    BenchmarkResults res = benchmark(seeds, depth, threads, maxPerHost, topN, delete, plugins);
    System.out.println(res);
    return 0;
  }
 
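  /**
   * Injects the synthetic seeds, then performs up to <code>depth</code>
   * generate/fetch/parse/updatedb/invertlinks cycles, timing each stage.
   */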
  public BenchmarkResults benchmark(int seeds, int depth, int threads, int maxPerHost,
        long topN, boolean delete, String plugins) throws Exception {
    Configuration conf = getConf();
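    // All fetches are routed through a proxy on localhost:8181; the
    // synthetic test-N.com hosts are presumably answered by a local
    // test server rather than the live web.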
    conf.set("http.proxy.host", "localhost");
    conf.setInt("http.proxy.port", 8181);
    conf.set("http.agent.name", "test");
    conf.set("http.robots.agents", "test,*");
    if (!plugins.equals("default")) {
      conf.set("plugin.includes", plugins);
    }
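    // Cap the number of URLs per host in each generated fetchlist.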
    conf.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
    conf.set(Generator.GENERATOR_COUNT_MODE, Generator.GENERATOR_COUNT_VALUE_HOST);
    JobConf job = new NutchJob(getConf());   
    FileSystem fs = FileSystem.get(job);
    Path dir = new Path(getConf().get("hadoop.tmp.dir"),
            "bench-" + System.currentTimeMillis());
    fs.mkdirs(dir);
    Path rootUrlDir = new Path(dir, "seed");
    fs.mkdirs(rootUrlDir);
    createSeeds(fs, rootUrlDir, seeds);

    if (LOG.isInfoEnabled()) {
      LOG.info("crawl started in: " + dir);
      LOG.info("rootUrlDir = " + rootUrlDir);
      LOG.info("threads = " + threads);
      LOG.info("depth = " + depth);     
    }
    BenchmarkResults res = new BenchmarkResults();
    res.delete = delete;
    res.depth = depth;
    res.plugins = plugins;
    res.seeds = seeds;
    res.threads = threads;
    res.topN = topN;
    Path crawlDb = new Path(dir + "/crawldb");
    Path linkDb = new Path(dir + "/linkdb");
    Path segments = new Path(dir + "/segments");
    res.elapsed = System.currentTimeMillis();
    Injector injector = new Injector(getConf());
    Generator generator = new Generator(getConf());
    Fetcher fetcher = new Fetcher(getConf());
    ParseSegment parseSegment = new ParseSegment(getConf());
    CrawlDb crawlDbTool = new CrawlDb(getConf());
    LinkDb linkDbTool = new LinkDb(getConf());
     
    // initialize crawlDb
    long start = System.currentTimeMillis();
    injector.inject(crawlDb, rootUrlDir);
    long delta = System.currentTimeMillis() - start;
    res.addTiming("inject", "0", delta);
    int i;
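    // One crawl cycle per depth level: generate a segment, fetch it,
    // parse if the fetcher didn't, update the crawldb, and invert links.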
    for (i = 0; i < depth; i++) {             // generate new segment
      start = System.currentTimeMillis();
      Path[] segs = generator.generate(crawlDb, segments, -1, topN, System
          .currentTimeMillis());
      delta = System.currentTimeMillis() - start;
      res.addTiming("generate", i + "", delta);
      if (segs == null) {
        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
        break;
      }
      start = System.currentTimeMillis();
      fetcher.fetch(segs[0], threads);  // fetch it
      delta = System.currentTimeMillis() - start;
      res.addTiming("fetch", i + "", delta);
      if (!Fetcher.isParsing(job)) {
        start = System.currentTimeMillis();
        parseSegment.parse(segs[0]);    // parse it, if needed
        delta = System.currentTimeMillis() - start;
        res.addTiming("parse", i + "", delta);
      }
      start = System.currentTimeMillis();
      crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
      delta = System.currentTimeMillis() - start;
      res.addTiming("update", i + "", delta);
      start = System.currentTimeMillis();
      linkDbTool.invert(linkDb, segs, true, true, false); // invert links
      delta = System.currentTimeMillis() - start;
      res.addTiming("invert", i + "", delta);
      // delete data
      if (delete) {
        for (Path p : segs) {
          fs.delete(p, true);
        }
      }
    }
    if (i == 0) {
      LOG.warn("No URLs to fetch - check your seed list and URL filters.");
    }
    if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
    res.elapsed = System.currentTimeMillis() - res.elapsed;
    CrawlDbReader dbreader = new CrawlDbReader();
    dbreader.processStatJob(crawlDb.toString(), conf, false);
    return res;
  }

}
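
Example usage: a minimal programmatic invocation sketch (the parameter
values are illustrative; the calls mirror what main() and run() do after
parsing command-line flags):

  Configuration conf = NutchConfiguration.create();
  Benchmark bench = new Benchmark();
  bench.setConf(conf);                          // inherited from Configured
  Benchmark.BenchmarkResults results = bench.benchmark(
      10,                 // seeds: unique hosts in the generated seed list
      3,                  // depth: number of crawl cycles
      10,                 // threads per Fetcher task
      Integer.MAX_VALUE,  // maxPerHost: no per-host fetchlist cap
      Long.MAX_VALUE,     // topN: no fetchlist size cap
      true,               // delete segment data after updatedb
      "default");         // keep plugin.includes from nutch-site.xml
  System.out.println(results);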