/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import java.io.*;
import java.net.*;
import java.util.*;
import java.text.*;
// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
/** Generates a subset of a crawl db to fetch. */
public class Generator extends ToolBase {
public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter";
public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
public static final String CRAWL_TOP_N = "crawl.topN";
public static final String CRAWL_GEN_CUR_TIME = "crawl.gen.curTime";
public static final String CRAWL_GEN_DELAY = "crawl.gen.delay";
public static final Log LOG = LogFactory.getLog(Generator.class);
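/**
 * A url paired with its CrawlDatum, used as the value type while entries
 * are keyed by score between the select map and reduce. Both fields are
 * serialized back-to-back, so the pair survives the shuffle intact.
 */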
public static class SelectorEntry implements Writable {
public Text url;
public CrawlDatum datum;
public SelectorEntry() {
url = new Text();
datum = new CrawlDatum();
}
public void readFields(DataInput in) throws IOException {
url.readFields(in);
datum.readFields(in);
}
public void write(DataOutput out) throws IOException {
url.write(out);
datum.write(out);
}
public String toString() {
return "url=" + url.toString() + ", datum=" + datum.toString();
}
}
/**
 * Selects entries due for fetch. This single class serves as mapper
 * (filters out entries not yet due and inverts them to &lt;score, entry&gt;
 * pairs), partitioner (groups entries by host) and reducer (enforces the
 * topN and per-host limits).
 */
public static class Selector implements Mapper, Partitioner, Reducer {
private LongWritable genTime = new LongWritable(System.currentTimeMillis());
private long curTime;
private long limit;
private long count;
private HashMap hostCounts = new HashMap();
private int maxPerHost;
private Partitioner hostPartitioner = new PartitionUrlByHost();
private URLFilters filters;
private URLNormalizers normalizers;
private ScoringFilters scfilters;
private SelectorEntry entry = new SelectorEntry();
private FloatWritable sortValue = new FloatWritable();
private boolean byIP;
private long dnsFailure = 0L;
private boolean filter;
private long genDelay;
private boolean runUpdatedb;
public void configure(JobConf job) {
curTime = job.getLong(CRAWL_GEN_CUR_TIME, System.currentTimeMillis());
// each reducer gets an equal share of the topN budget
limit = job.getLong(CRAWL_TOP_N, Long.MAX_VALUE) / job.getNumReduceTasks();
maxPerHost = job.getInt(GENERATE_MAX_PER_HOST, -1);
byIP = job.getBoolean(GENERATE_MAX_PER_HOST_BY_IP, false);
filters = new URLFilters(job);
normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
scfilters = new ScoringFilters(job);
hostPartitioner.configure(job);
filter = job.getBoolean(CRAWL_GENERATE_FILTER, true);
genDelay = job.getLong(CRAWL_GEN_DELAY, 7L) * 3600L * 24L * 1000L;
long time = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
if (time > 0) genTime.set(time);
runUpdatedb = job.getBoolean(GENERATE_UPDATE_CRAWLDB, false);
}
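/*
 * A minimal sketch of how these knobs might be set before running the job,
 * using the property-name constants defined above (values here are
 * illustrative, not defaults or recommendations):
 *
 *   Configuration conf = NutchConfiguration.create();
 *   conf.setInt(Generator.GENERATE_MAX_PER_HOST, 100);      // cap URLs per host
 *   conf.setBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, false);
 *   conf.setLong(Generator.CRAWL_GEN_DELAY, 7L);            // days to wait for fetch & update
 */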
public void close() {}
/** Select & invert subset due for fetch. */
public void map(WritableComparable key, Writable value,
OutputCollector output, Reporter reporter)
throws IOException {
Text url = (Text)key;
if (filter) {
  // If filtering is on, don't generate URLs that don't pass URLFilters.
  try {
    if (filters.filter(url.toString()) == null)
      return;
  } catch (URLFilterException e) {
    // log and fall through: a failing filter doesn't drop the URL
    if (LOG.isWarnEnabled()) {
      LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage()
          + ")");
    }
  }
}
CrawlDatum crawlDatum = (CrawlDatum)value;
if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE ||
crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM)
return; // don't retry
if (crawlDatum.getFetchTime() > curTime)
return; // not time yet
LongWritable oldGenTime = (LongWritable)crawlDatum.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
if (oldGenTime != null) { // awaiting fetch & update
if (oldGenTime.get() + genDelay > curTime) // still wait for update
return;
}
float sort = 1.0f;
try {
sort = scfilters.generatorSortValue((Text)key, crawlDatum, sort);
} catch (ScoringFilterException sfe) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't filter generatorSortValue for " + key + ": " + sfe);
}
}
// sort by decreasing score, using DecreasingFloatComparator
sortValue.set(sort);
// record generation time
crawlDatum.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
entry.datum = crawlDatum;
entry.url = (Text)key;
output.collect(sortValue, entry); // invert for sort by score
}
/** Partition by host. */
public int getPartition(WritableComparable key, Writable value,
int numReduceTasks) {
return hostPartitioner.getPartition(((SelectorEntry)value).url, key,
numReduceTasks);
}
/** Collect until limit is reached. */
public void reduce(WritableComparable key, Iterator values,
OutputCollector output, Reporter reporter)
throws IOException {
while (values.hasNext() && count < limit) {
SelectorEntry entry = (SelectorEntry)values.next();
Text url = entry.url;
if (maxPerHost > 0) { // are we counting hosts?
  URL u;
  try {
    u = new URL(url.toString());
  } catch (MalformedURLException mue) {
    // don't let one bad record abort the whole reduce
    LOG.warn("Malformed URL: '" + url + "', skipping.");
    continue;
  }
  String host = u.getHost();
  if (host == null) {
    // unknown host, skip
    continue;
  }
host = host.toLowerCase();
if (byIP) {
try {
InetAddress ia = InetAddress.getByName(host);
host = ia.getHostAddress();
} catch (UnknownHostException uhe) {
if (LOG.isDebugEnabled()) {
LOG.debug("DNS lookup failed: " + host + ", skipping.");
}
dnsFailure++;
if ((dnsFailure % 1000 == 0) && (LOG.isWarnEnabled())) {
LOG.warn("DNS failures: " + dnsFailure);
}
continue;
}
}
u = new URL(u.getProtocol(), host, u.getPort(), u.getFile());
String urlString = u.toString();
try {
urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
host = new URL(urlString).getHost();
} catch (Exception e) {
LOG.warn("Malformed URL: '" + urlString + "', skipping (" +
StringUtils.stringifyException(e) + ")");
continue;
}
IntWritable hostCount = (IntWritable)hostCounts.get(host);
if (hostCount == null) {
hostCount = new IntWritable();
hostCounts.put(host, hostCount);
}
// increment hostCount
hostCount.set(hostCount.get() + 1);
// skip URL if above the limit per host.
if (hostCount.get() > maxPerHost) {
if (hostCount.get() == maxPerHost + 1) {
if (LOG.isInfoEnabled()) {
LOG.info("Host " + host + " has more than " + maxPerHost +
" URLs." + " Skipping additional.");
}
}
continue;
}
}
output.collect(key, entry);
// count only the URLs we keep; per-host skips don't consume the limit
count++;
}
}
}
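/**
 * Sorts FloatWritable keys in decreasing order by delegating to the base
 * FloatWritable.Comparator with its arguments swapped, so the
 * highest-scoring entries reach the reducer first and survive the topN cut.
 */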
public static class DecreasingFloatComparator extends FloatWritable.Comparator {
/** Compares two FloatWritables in decreasing order. */
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
return super.compare(b2, s2, l2, b1, s1, l1);
}
}
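/** Restores the &lt;url, datum&gt; orientation of entries that the select
 * pass inverted in order to sort by score. */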
public static class SelectorInverseMapper extends MapReduceBase implements Mapper {
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
SelectorEntry entry = (SelectorEntry)value;
output.collect(entry.url, entry.datum);
}
}
/** Sort fetch lists by hash of URL, so that URLs from the same host are
 * spread throughout a fetch list instead of being clustered together. */
public static class HashComparator extends WritableComparator {
public HashComparator() {
super(Text.class);
}
public int compare(WritableComparable a, WritableComparable b) {
Text url1 = (Text) a;
Text url2 = (Text) b;
int hash1 = hash(url1.getBytes(), 0, url1.getLength());
int hash2 = hash(url2.getBytes(), 0, url2.getLength());
return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 0 : 1));
}
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
int hash1 = hash(b1, s1, l1);
int hash2 = hash(b2, s2, l2);
return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 0 : 1));
}
private static int hash(byte[] bytes, int start, int length) {
int hash = 1;
// make later bytes more significant in hash code, so that sorting by
// hashcode correlates less with by-host ordering.
for (int i = length - 1; i >= 0; i--)
hash = (31 * hash) + (int) bytes[start + i];
return hash;
}
}
/**
 * Updates the CrawlDb so that the next generate won't include the same
 * URLs: each selected entry is stamped with the current generation time,
 * which the Selector then uses to skip it until it has been fetched and
 * updated, or until crawl.gen.delay expires.
 */
public static class CrawlDbUpdater extends MapReduceBase implements Mapper, Reducer {
long generateTime;
public void configure(JobConf job) {
generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
}
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
if (key instanceof FloatWritable) { // tempDir source
SelectorEntry se = (SelectorEntry)value;
output.collect(se.url, se.datum);
} else {
output.collect(key, value);
}
}
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
  CrawlDatum orig = new CrawlDatum();
  LongWritable genTime = null;
  while (values.hasNext()) {
    CrawlDatum val = (CrawlDatum)values.next();
    if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
      genTime = (LongWritable)val.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
      if (genTime.get() != generateTime) {
        // stamped by an earlier generate - treat it as the stored datum
        orig.set(val);
        genTime = null;
        continue;
      }
    } else {
      // copy the datum - Hadoop reuses the value object across next() calls
      orig.set(val);
    }
  }
  if (genTime != null) {
    orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
  }
  output.collect(key, orig);
}
}
public Generator() {
}
public Generator(Configuration conf) {
setConf(conf);
}
/** Generate fetchlists in a segment with defaults: no topN limit, one
 * fetch list per map task, current time, filtering on, no forced lock. */
public Path generate(Path dbDir, Path segments)
throws IOException {
return generate(dbDir, segments, -1, Long.MAX_VALUE,
    System.currentTimeMillis(), true, false);
}
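/*
 * A minimal sketch of programmatic use (the "crawl/..." paths are
 * hypothetical):
 *
 *   Generator g = new Generator(NutchConfiguration.create());
 *   Path segment = g.generate(new Path("crawl/crawldb"),
 *                             new Path("crawl/segments"));
 *   if (segment != null) {
 *     // segment now contains a crawl_generate directory ready for fetching
 *   }
 */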
/**
 * Generate fetchlists in a segment.
 * @param dbDir the crawl database directory
 * @param segments the directory in which the new segment is created
 * @param numLists the number of fetch lists (and partitions); -1 means one per map task
 * @param topN the maximum total number of entries to select
 * @param curTime entries whose fetch time lies after this instant are skipped
 * @param filter whether to pass selected URLs through URLFilters
 * @param force whether to proceed even if the CrawlDb lock file exists
 * @return Path to generated segment or null if no entries were selected.
 */
public Path generate(Path dbDir, Path segments,
int numLists, long topN, long curTime, boolean filter,
boolean force)
throws IOException {
Path tempDir =
new Path(getConf().get("mapred.temp.dir", ".") +
"/generate-temp-"+ System.currentTimeMillis());
Path segment = new Path(segments, generateSegmentName());
Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
FileSystem fs = FileSystem.get(getConf());
LockUtil.createLockFile(fs, lock, force);
LOG.info("Generator: Selecting best-scoring urls due for fetch.");
LOG.info("Generator: starting");
LOG.info("Generator: segment: " + segment);
LOG.info("Generator: filtering: " + filter);
if (topN != Long.MAX_VALUE) {
LOG.info("Generator: topN: " + topN);
}
// map to inverted subset due for fetch, sort by score
JobConf job = new NutchJob(getConf());
job.setJobName("generate: select " + segment);
if (numLists == -1) { // for politeness make
numLists = job.getNumMapTasks(); // a partition per fetch task
}
if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
// override
LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
numLists = 1;
}
job.setLong(CRAWL_GEN_CUR_TIME, curTime);
// record real generation time
long generateTime = System.currentTimeMillis();
job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
job.setLong(CRAWL_TOP_N, topN);
job.setBoolean(CRAWL_GENERATE_FILTER, filter);
job.setInputPath(new Path(dbDir, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(Selector.class);
job.setPartitionerClass(Selector.class);
job.setReducerClass(Selector.class);
job.setOutputPath(tempDir);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setOutputKeyClass(FloatWritable.class);
job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
job.setOutputValueClass(SelectorEntry.class);
try {
JobClient.runJob(job);
} catch (IOException e) {
LockUtil.removeLockFile(fs, lock);
throw e;
}
// check that we selected at least some entries ...
SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(job, tempDir);
boolean empty = readers == null || readers.length == 0 ||
    !readers[0].next(new FloatWritable());
if (readers != null) {
  // close the readers in all cases, including the empty early return
  for (int i = 0; i < readers.length; i++) readers[i].close();
}
if (empty) {
  LOG.warn("Generator: 0 records selected for fetching, exiting ...");
  LockUtil.removeLockFile(fs, lock);
  fs.delete(tempDir);
  return null;
}
// invert again, partition by host, sort by url hash
if (LOG.isInfoEnabled()) {
LOG.info("Generator: Partitioning selected urls by host, for politeness.");
}
job = new NutchJob(getConf());
job.setJobName("generate: partition " + segment);
job.setInt("partition.url.by.host.seed", new Random().nextInt());
job.setInputPath(tempDir);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(SelectorInverseMapper.class);
job.setPartitionerClass(PartitionUrlByHost.class);
job.setNumReduceTasks(numLists);
job.setOutputPath(output);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
job.setOutputKeyComparatorClass(HashComparator.class);
try {
JobClient.runJob(job);
} catch (IOException e) {
LockUtil.removeLockFile(fs, lock);
fs.delete(tempDir);
throw e;
}
if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
// update the db from tempDir
Path tempDir2 =
new Path(getConf().get("mapred.temp.dir", ".") +
"/generate-temp-"+ System.currentTimeMillis());
job = new NutchJob(getConf());
job.setJobName("generate: updatedb " + dbDir);
job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
job.addInputPath(tempDir);
job.addInputPath(new Path(dbDir, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(CrawlDbUpdater.class);
job.setReducerClass(CrawlDbUpdater.class);
job.setOutputFormat(MapFileOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
job.setOutputPath(tempDir2);
try {
JobClient.runJob(job);
CrawlDb.install(job, dbDir);
} catch (IOException e) {
LockUtil.removeLockFile(fs, lock);
fs.delete(tempDir);
fs.delete(tempDir2);
throw e;
}
fs.delete(tempDir2);
}
LockUtil.removeLockFile(fs, lock);
fs.delete(tempDir);
if (LOG.isInfoEnabled()) { LOG.info("Generator: done."); }
return segment;
}
private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
public static synchronized String generateSegmentName() {
try {
  // segment names have one-second resolution; sleep to keep them unique
  Thread.sleep(1000);
} catch (Throwable t) {}
return sdf.format(new Date(System.currentTimeMillis()));
}
/**
* Generate a fetchlist from the crawldb.
*/
public static void main(String[] args) throws Exception {
int res = new Generator().doMain(NutchConfiguration.create(), args);
System.exit(res);
}
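/*
 * A typical command-line invocation, assuming the standard bin/nutch
 * launcher (paths and values are illustrative):
 *
 *   bin/nutch generate crawl/crawldb crawl/segments -topN 1000 -numFetchers 4
 */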
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.out.println("Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter]");
return -1;
}
Path dbDir = new Path(args[0]);
Path segmentsDir = new Path(args[1]);
long curTime = System.currentTimeMillis();
long topN = Long.MAX_VALUE;
int numFetchers = -1;
boolean filter = true;
boolean force = false;
for (int i = 2; i < args.length; i++) {
if ("-topN".equals(args[i])) {
topN = Long.parseLong(args[i+1]);
i++;
} else if ("-numFetchers".equals(args[i])) {
numFetchers = Integer.parseInt(args[i+1]);
i++;
} else if ("-adddays".equals(args[i])) {
long numDays = Integer.parseInt(args[i+1]);
curTime += numDays * 1000L * 60 * 60 * 24;
} else if ("-noFilter".equals(args[i])) {
filter = false;
} else if ("-force".equals(args[i])) {
force = true;
}
}
try {
Path seg = generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter, force);
if (seg == null) return -2;
else return 0;
} catch (Exception e) {
LOG.fatal("Generator: " + StringUtils.stringifyException(e));
return -1;
}
}
}