Examples of NutchJob


Examples of org.apache.nutch.util.NutchJob

  private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb =
      new Path("linkdb-" +
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(LinkDb.class);
    // if we don't run the mergeJob, perform normalization/filtering now
    if (normalize || filter) {
      try {
        FileSystem fs = FileSystem.get(config);
        if (!fs.exists(linkDb)) {
          job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
          job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
        }
      } catch (Exception e) {
        LOG.warn("LinkDb createJob: " + e);
      }
    }
    job.setReducerClass(LinkDb.class);

    job.setOutputPath(newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
  }
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

  public static JobConf createMergeJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb =
      new Path("linkdb-merge-" +
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb merge " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(LinkDbFilter.class);
    job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
    job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
    job.setReducerClass(Merger.class);

    job.setOutputPath(newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
  }
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

    throws IOException {
    Path newCrawlDb =
      new Path(crawlDb,
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("crawldb " + crawlDb);


    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(job).exists(current)) {
      job.addInputPath(current);
    }
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);

    job.setOutputPath(newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    return job;
  }
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

  }

  public static JobConf createMergeJob(Configuration conf, Path output, boolean normalize, boolean filter) {
    Path newCrawlDb = new Path("crawldb-merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(conf);
    job.setJobName("crawldb merge " + output);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
    job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
    job.setReducerClass(Merger.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    return job;
  }
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

    Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);

    // run the inverter job
    Path tempInverted = new Path(webGraphDb, "inverted-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf inverter = new NutchJob(conf);
    inverter.setJobName("LinkDumper: inverter");
    FileInputFormat.addInputPath(inverter, nodeDb);
    if (loopsExists) {
      FileInputFormat.addInputPath(inverter, loopSetDb);
    }
    FileInputFormat.addInputPath(inverter, outlinkDb);
    inverter.setInputFormat(SequenceFileInputFormat.class);
    inverter.setMapperClass(Inverter.class);
    inverter.setReducerClass(Inverter.class);
    inverter.setMapOutputKeyClass(Text.class);
    inverter.setMapOutputValueClass(ObjectWritable.class);
    inverter.setOutputKeyClass(Text.class);
    inverter.setOutputValueClass(LinkNode.class);
    FileOutputFormat.setOutputPath(inverter, tempInverted);
    inverter.setOutputFormat(SequenceFileOutputFormat.class);

    try {
      LOG.info("LinkDumper: running inverter");
      JobClient.runJob(inverter);
      LOG.info("LinkDumper: finished inverter");
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

    // run the merger job
    JobConf merger = new NutchJob(conf);
    merger.setJobName("LinkDumper: merger");
    FileInputFormat.addInputPath(merger, tempInverted);
    merger.setInputFormat(SequenceFileInputFormat.class);
    merger.setReducerClass(Merger.class);
    merger.setMapOutputKeyClass(Text.class);
    merger.setMapOutputValueClass(LinkNode.class);
    merger.setOutputKeyClass(Text.class);
    merger.setOutputValueClass(LinkNodes.class);
    FileOutputFormat.setOutputPath(merger, linkdump);
    merger.setOutputFormat(MapFileOutputFormat.class);

    try {
      LOG.info("LinkDumper: running merger");
      JobClient.runJob(merger);
      LOG.info("LinkDumper: finished merger");
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

    // set CRAWLDB_PURGE_404 to true
    conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404,true);
    conf.setBoolean(CrawlDbFilter.URL_NORMALIZING,true);
    conf.setBoolean(CrawlDbFilter.URL_FILTERING,false);
    conf.setInt("urlnormalizer.loop.count", 2);
    JobConf job = new NutchJob(conf);
    job.setJobName("Test CrawlDbFilter");
    Path current = new Path(dbDir, "current");
    if (FileSystem.get(job).exists(current)) {
      FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);
    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    JobClient.runJob(job);

    Path fetchlist = new Path(new Path(newCrawlDb,
      "part-00000"), "data");
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FreeGenerator: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setBoolean(FILTER_KEY, filter);
    job.setBoolean(NORMALIZE_KEY, normalize);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(FG.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Generator.SelectorEntry.class);
    job.setPartitionerClass(URLPartitioner.class);
    job.setReducerClass(FG.class);
    String segName = Generator.generateSegmentName();
    job.setNumReduceTasks(job.getNumMapTasks());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(Generator.HashComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1],
        new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
    try {
      JobClient.runJob(job);
    } catch (Exception e) {
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

  public void merge(Path out, Path[] segs, boolean filter, boolean normalize, long slice) throws Exception {
    String segmentName = Generator.generateSegmentName();
    if (LOG.isInfoEnabled()) {
      LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName);
    }
    JobConf job = new NutchJob(getConf());
    job.setJobName("mergesegs " + out + "/" + segmentName);
    job.setBoolean("segment.merger.filter", filter);
    job.setBoolean("segment.merger.normalizer", normalize);
    job.setLong("segment.merger.slice", slice);
    job.set("segment.merger.segmentName", segmentName);
    FileSystem fs = FileSystem.get(getConf());
    // prepare the minimal common set of input dirs
    boolean g = true;
    boolean f = true;
    boolean p = true;
    boolean c = true;
    boolean pd = true;
    boolean pt = true;
    for (int i = 0; i < segs.length; i++) {
      if (!fs.exists(segs[i])) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Input dir " + segs[i] + " doesn't exist, skipping.");
        }
        segs[i] = null;
        continue;
      }
      if (LOG.isInfoEnabled()) {
        LOG.info("SegmentMerger:   adding " + segs[i]);
      }
      Path cDir = new Path(segs[i], Content.DIR_NAME);
      Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
      Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
      Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
      Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
      Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
      c = c && fs.exists(cDir);
      g = g && fs.exists(gDir);
      f = f && fs.exists(fDir);
      p = p && fs.exists(pDir);
      pd = pd && fs.exists(pdDir);
      pt = pt && fs.exists(ptDir);
    }
    StringBuffer sb = new StringBuffer();
    if (c) sb.append(" " + Content.DIR_NAME);
    if (g) sb.append(" " + CrawlDatum.GENERATE_DIR_NAME);
    if (f) sb.append(" " + CrawlDatum.FETCH_DIR_NAME);
    if (p) sb.append(" " + CrawlDatum.PARSE_DIR_NAME);
    if (pd) sb.append(" " + ParseData.DIR_NAME);
    if (pt) sb.append(" " + ParseText.DIR_NAME);
    if (LOG.isInfoEnabled()) {
      LOG.info("SegmentMerger: using segment data from:" + sb.toString());
    }
    for (int i = 0; i < segs.length; i++) {
      if (segs[i] == null) continue;
      if (g) {
        Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
        FileInputFormat.addInputPath(job, gDir);
      }
      if (c) {
        Path cDir = new Path(segs[i], Content.DIR_NAME);
        FileInputFormat.addInputPath(job, cDir);
      }
      if (f) {
        Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
        FileInputFormat.addInputPath(job, fDir);
      }
      if (p) {
        Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
        FileInputFormat.addInputPath(job, pDir);
      }
      if (pd) {
        Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
        FileInputFormat.addInputPath(job, pdDir);
      }
      if (pt) {
        Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
        FileInputFormat.addInputPath(job, ptDir);
      }
    }
    job.setInputFormat(ObjectInputFormat.class);
    job.setMapperClass(SegmentMerger.class);
    job.setReducerClass(SegmentMerger.class);
    FileOutputFormat.setOutputPath(job, out);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(MetaWrapper.class);
    job.setOutputFormat(SegmentOutputFormat.class);
   
    setConf(job);
   
    JobClient.runJob(job);
  }
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

      LOG.info("CrawlDb statistics start: " + crawlDb);
    }

    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);

    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // https://issues.apache.org/jira/browse/NUTCH-1029
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    JobClient.runJob(job);

    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

      LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);

    JobConf job = new NutchJob(config);
    job.setJobName("dump " + crawlDb);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, outFolder);

    if (format.equals("csv")) {
      job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
    }
    else if (format.equals("crawldb")) {
      job.setOutputFormat(MapFileOutputFormat.class);
    } else {
      job.setOutputFormat(TextOutputFormat.class);
    }

    if (status != null) job.set("status", status);
    if (regex != null) job.set("regex", regex);
    if (retry != null) job.setInt("retry", retry);
   
    job.setMapperClass(CrawlDbDumpMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) { LOG.info("CrawlDb dump: done"); }
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.