// NOTE(review): this span is the tail of a cache-inspection branch whose opening
// `if` lies above this excerpt; the `else if` arms near the bottom pair with it.
// Build a synthetic Response from the cached header + content so the hit can be
// served without a network round trip.
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
DigestURI refererURL = null;
// resolve the referrer hash back into a URL, if one was recorded (may stay null)
if (request.referrerhash() != null) refererURL = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
// wrap the cached payload; "200" mimics a successful HTTP status for the synthetic response
final Response response = new Response(
request,
requestHeader,
cachedResponse,
"200",
crawlProfile,
content);
// check which caching strategy shall be used
if (cacheStrategy == CacheStrategy.IFEXIST || cacheStrategy == CacheStrategy.CACHEONLY) {
// well, just take the cache and don't care about freshness of the content
this.log.logInfo("cache hit/useall for: " + url.toNormalform(true, false));
return response;
}
// now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
if (response.isFreshForProxy()) {
this.log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false));
return response;
} else {
// stale entry: log it and fall through to the network load further below
this.log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
}
} else if (cachedResponse != null) {
// header present but body missing -> inconsistent cache entry, ignore it
this.log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
} else if (content != null) {
// body present but header missing -> inconsistent cache entry, ignore it
this.log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
}
}
// check case where we want results from the cache exclusively, and never from the internet (offline mode)
if (cacheStrategy == CacheStrategy.CACHEONLY) {
    // we had a chance to get the content from the cache .. its over. We don't have it.
    throw new IOException("cache only strategy");
}

// now forget about the cache, nothing there. Try to load the content from the internet

// check access time: this is a double-check (we checked possibly already in the balancer)
// to make sure that we don't DoS the target by mistake.
// The host null-guard mirrors the one on the put() below; accessTime may be a
// null-hostile map (e.g. ConcurrentHashMap), where get(null) throws an NPE.
if (!url.isLocal() && host != null) {
    final Long lastAccess = accessTime.get(host);
    long wait = 0;
    // enforce a minimum delay between two accesses to the same host
    if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
    if (wait > 0) {
        // force a sleep here; while waiting anyway, use the time to clean up the accessTime map
        final long untilTime = System.currentTimeMillis() + wait;
        cleanupAccessTimeTable(untilTime);
        if (System.currentTimeMillis() < untilTime) {
            try {
                Thread.sleep(untilTime - System.currentTimeMillis());
            } catch (final InterruptedException e) {
                // do not swallow the interrupt: restore the flag so callers can react
                Thread.currentThread().interrupt();
            }
        }
    }
}

// now it's for sure that we will access the target. Remember the access time
if (host != null) accessTime.put(host, System.currentTimeMillis());
// load the resource from the internet, dispatching on the URL scheme;
// the protocols are mutually exclusive, so an else-if chain is equivalent
Response response = null;
if (protocol.equals("http") || protocol.equals("https")) {
    response = this.httpLoader.load(request, maxFileSize, checkBlacklist);
} else if (protocol.equals("ftp")) {
    response = this.ftpLoader.load(request, true);
} else if (protocol.equals("smb")) {
    response = this.smbLoader.load(request, true);
} else if (protocol.equals("file")) {
    response = this.fileLoader.load(request, true);
}

// no loader matched, or the loader returned no content: give up
if (response == null || response.getContent() == null) {
    throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}

// we got something; decide whether the result shall be stored in the cache.
// First gate: does the crawl profile request caching at all?
if (crawlProfile == null || !crawlProfile.storeHTCache()) {
    // no caching wanted. Thats ok, do not write any message
    return response;
}

// second gate: does the protocol / response itself forbid caching?
final String storeError = response.shallStoreCacheForCrawler();
if (storeError != null) {
    this.log.logWarning("cannot write " + response.url() + " to Cache (4): " + storeError);
    return response;
}
try {
    Cache.store(url, response.getResponseHeader(), response.getContent());
} catch (final IOException e) {
    this.log.logWarning("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e);
}
return response;