Source Code of org.apache.nutch.crawl.CrawlDbReader$CrawlDbTopNReducer

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.nutch.crawl;


import java.io.IOException;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeMap;


// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Closeable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;


/**
 * Read utility for the CrawlDB.
 * 
 * @author Andrzej Bialecki
 * 
 */
public class CrawlDbReader implements Closeable {


  public static final Log LOG = LogFactory.getLog(CrawlDbReader.class);
  
  private MapFile.Reader[] readers = null;
  
  private void openReaders(String crawlDb, Configuration config) throws IOException {
    if (readers != null) return;
    FileSystem fs = FileSystem.get(config);
    readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDb.CURRENT_NAME), config);
  }
  
  private void closeReaders() {
    if (readers == null) return;
    for (int i = 0; i < readers.length; i++) {
      try {
        readers[i].close();
      } catch (Exception e) {
        
      }
    }
  }


  public static class CrawlDbStatMapper implements Mapper {
    LongWritable COUNT_1 = new LongWritable(1);
    public void configure(JobConf job) {}
    public void close() {}
    public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter)
            throws IOException {
      CrawlDatum cd = (CrawlDatum) value;
      output.collect(new Text("T"), COUNT_1);
      output.collect(new Text("status " + cd.getStatus()), COUNT_1);
      output.collect(new Text("retry " + cd.getRetriesSinceFetch()), COUNT_1);
      output.collect(new Text("s"), new LongWritable((long) (cd.getScore() * 1000.0)));
    }
  }
  
  public static class CrawlDbStatCombiner implements Reducer {
    LongWritable val = new LongWritable();
    
    public CrawlDbStatCombiner() { }
    public void configure(JobConf job) { }
    public void close() {}
    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
        throws IOException {
      val.set(0L);
      String k = ((Text)key).toString();
      if (!k.equals("s")) {
        while (values.hasNext()) {
          LongWritable cnt = (LongWritable)values.next();
          val.set(val.get() + cnt.get());
        }
        output.collect(key, val);
      } else {
        long total = 0;
        long min = Long.MAX_VALUE;
        long max = Long.MIN_VALUE;
        while (values.hasNext()) {
          LongWritable cnt = (LongWritable)values.next();
          if (cnt.get() < min) min = cnt.get();
          if (cnt.get() > max) max = cnt.get();
          total += cnt.get();
        }
        output.collect(new Text("scn"), new LongWritable(min));
        output.collect(new Text("scx"), new LongWritable(max));
        output.collect(new Text("sct"), new LongWritable(total));
      }
    }
  }


  public static class CrawlDbStatReducer implements Reducer {
    public void configure(JobConf job) {}
    public void close() {}
    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
            throws IOException {


      String k = ((Text) key).toString();
      if (k.equals("T")) {
        // sum all values for this key
        long sum = 0;
        while (values.hasNext()) {
          sum += ((LongWritable) values.next()).get();
        }
        // output sum
        output.collect(key, new LongWritable(sum));
      } else if (k.startsWith("status") || k.startsWith("retry")) {
        LongWritable cnt = new LongWritable();
        while (values.hasNext()) {
          LongWritable val = (LongWritable)values.next();
          cnt.set(cnt.get() + val.get());
        }
        output.collect(key, cnt);
      } else if (k.equals("scx")) {
        LongWritable cnt = new LongWritable(Long.MIN_VALUE);
        while (values.hasNext()) {
          LongWritable val = (LongWritable)values.next();
          if (cnt.get() < val.get()) cnt.set(val.get());
        }
        output.collect(key, cnt);
      } else if (k.equals("scn")) {
        LongWritable cnt = new LongWritable(Long.MAX_VALUE);
        while (values.hasNext()) {
          LongWritable val = (LongWritable)values.next();
          if (cnt.get() > val.get()) cnt.set(val.get());
        }
        output.collect(key, cnt);
      } else if (k.equals("sct")) {
        LongWritable cnt = new LongWritable();
        while (values.hasNext()) {
          LongWritable val = (LongWritable)values.next();
          cnt.set(cnt.get() + val.get());
        }
        output.collect(key, cnt);
      }
    }
  }


  public static class CrawlDbDumpReducer implements Reducer {


    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
      while (values.hasNext()) {
        output.collect(key, (Writable)values.next());
      }
    }


    public void configure(JobConf job) {}
    public void close() {}
  }
  
  public static class CrawlDbTopNMapper implements Mapper {
    private static final FloatWritable fw = new FloatWritable();
    private float min = 0.0f;
    
    public void configure(JobConf job) {
      long lmin = job.getLong("CrawlDbReader.topN.min", 0);
      if (lmin != 0) {
        min = (float)lmin / 1000000.0f;
      }
    }
    public void close() {}
    public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter)
            throws IOException {
      CrawlDatum datum = (CrawlDatum)value;
      if (datum.getScore() < min) return; // don't collect low-scoring records
      fw.set(-datum.getScore()); // reverse sorting order
      output.collect(fw, key); // invert mapping: score -> url
    }
  }
  
  public static class CrawlDbTopNReducer implements Reducer {
    private long topN;
    private long count = 0L;
    
    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
      while (values.hasNext() && count < topN) {
        FloatWritable fw = (FloatWritable)key;
        fw.set(-fw.get());
        output.collect(fw, (Writable)values.next());
        count++;
      }
    }


    public void configure(JobConf job) {
      topN = job.getLong("CrawlDbReader.topN", 100) / job.getNumReduceTasks();
    }
    
    public void close() {}
  }


  public void close() {
    closeReaders();
  }
  
  public void processStatJob(String crawlDb, Configuration config) throws IOException {


    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb statistics start: " + crawlDb);
    }
    
    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());


    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);


    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);


    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);


    job.setOutputPath(tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);


    JobClient.runJob(job);


    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);


    Text key = new Text();
    LongWritable value = new LongWritable();


    TreeMap stats = new TreeMap();
    for (int i = 0; i < readers.length; i++) {
      SequenceFile.Reader reader = readers[i];
      while (reader.next(key, value)) {
        String k = key.toString();
        LongWritable val = (LongWritable) stats.get(k);
        if (val == null) {
          val = new LongWritable();
          if (k.equals("scx")) val.set(Long.MIN_VALUE);
          if (k.equals("scn")) val.set(Long.MAX_VALUE);
          stats.put(k, val);
        }
        if (k.equals("scx")) {
          if (val.get() < value.get()) val.set(value.get());
        } else if (k.equals("scn")) {
          if (val.get() > value.get()) val.set(value.get());          
        } else {
          val.set(val.get() + value.get());
        }
      }
      reader.close();
    }
    
    if (LOG.isInfoEnabled()) {
      LOG.info("Statistics for CrawlDb: " + crawlDb);
      LongWritable totalCnt = (LongWritable)stats.get("T");
      stats.remove("T");
      LOG.info("TOTAL urls:\t" + totalCnt.get());
      Iterator it = stats.keySet().iterator();
      while (it.hasNext()) {
        String k = (String) it.next();
        LongWritable val = (LongWritable) stats.get(k);
        if (k.equals("scn")) {
          LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
        } else if (k.equals("scx")) {
          LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
        } else if (k.equals("sct")) {
          LOG.info("avg score:\t" + (float) ((double) (val.get() / totalCnt.get()) / 1000.0));
        } else if (k.startsWith("status")) {
          int code = Integer.parseInt(k.substring(k.indexOf(' ') + 1));
          LOG.info(k + " (" + CrawlDatum.getStatusName((byte)code) + "):\t" + val);
        } else LOG.info(k + ":\t" + val);
      }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder);
    if (LOG.isInfoEnabled()) { LOG.info("CrawlDb statistics: done"); }


  }
  
  public CrawlDatum get(String crawlDb, String url, Configuration config) throws IOException {
    Text key = new Text(url);
    CrawlDatum val = new CrawlDatum();
    openReaders(crawlDb, config);
    CrawlDatum res = (CrawlDatum)MapFileOutputFormat.getEntry(readers, new HashPartitioner(), key, val);
    return res;
  }


  public void readUrl(String crawlDb, String url, Configuration config) throws IOException {
    CrawlDatum res = get(crawlDb, url, config);
    System.out.println("URL: " + url);
    if (res != null) {
      System.out.println(res);
    } else {
      System.out.println("not found");
    }
  }
  
  public void processDumpJob(String crawlDb, String output, Configuration config) throws IOException {


    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb dump: starting");
      LOG.info("CrawlDb db: " + crawlDb);
    }
    
    Path outFolder = new Path(output);


    JobConf job = new NutchJob(config);
    job.setJobName("dump " + crawlDb);


    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);


    job.setOutputPath(outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);


    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) { LOG.info("CrawlDb dump: done"); }
  }


  public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config) throws IOException {
    
    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
      LOG.info("CrawlDb db: " + crawlDb);
    }
    
    Path outFolder = new Path(output);
    Path tempDir =
      new Path(config.get("mapred.temp.dir", ".") +
               "/readdb-topN-temp-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));


    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);


    job.setOutputPath(tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);


    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("CrawlDbReader.topN.min", Math.round(1000000.0 * min));
    JobClient.runJob(job); 
    
    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("CrawlDbReader.topN", topN);


    job.addInputPath(tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);


    job.setOutputPath(outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);


    // XXX *sigh* this apparently doesn't work ... :-((
    job.setNumReduceTasks(1); // create a single file.
    
    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir);
    if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: done"); }


  }


  public static void main(String[] args) throws IOException {
    CrawlDbReader dbr = new CrawlDbReader();


    if (args.length < 1) {
      System.err.println("Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url>)");
      System.err.println("\t<crawldb>\tdirectory name where crawldb is located");
      System.err.println("\t-stats\tprint overall statistics to System.out");
      System.err.println("\t-dump <out_dir>\tdump the whole db to a text file in <out_dir>");
      System.err.println("\t-url <url>\tprint information on <url> to System.out");
      System.err.println("\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn> urls sorted by score to <out_dir>");
      System.err.println("\t\t[<min>]\tskip records with scores below this value.");
      System.err.println("\t\t\tThis can significantly improve performance.");
      return;
    }
    String param = null;
    String crawlDb = args[0];
    Configuration conf = NutchConfiguration.create();
    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-stats")) {
        dbr.processStatJob(crawlDb, conf);
      } else if (args[i].equals("-dump")) {
        param = args[++i];
        dbr.processDumpJob(crawlDb, param, conf);
      } else if (args[i].equals("-url")) {
        param = args[++i];
        dbr.readUrl(crawlDb, param, conf);
      } else if (args[i].equals("-topN")) {
        param = args[++i];
        long topN = Long.parseLong(param);
        param = args[++i];
        float min = 0.0f;
        if (i < args.length - 1) {
          min = Float.parseFloat(args[++i]);
        }
        dbr.processTopNJob(crawlDb, topN, min, param, conf);
      } else {
        System.err.println("\nError: wrong argument " + args[i]);
      }
    }
    return;
  }
}
Source Code of org.apache.nutch.crawl.CrawlDbReader$CrawlDbTopNReducer

Related Classes of org.apache.nutch.crawl.CrawlDbReader$CrawlDbTopNReducer