/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.reporting;
import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.collections.Closure;
import org.archive.bdb.BdbModule;
import org.archive.bdb.DisposableStoredSortedMap;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.crawler.event.CrawlStateEvent;
import org.archive.crawler.event.CrawlURIDispositionEvent;
import org.archive.crawler.event.StatSnapshotEvent;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Engine;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.crawler.util.TopNSet;
import org.archive.modules.CrawlURI;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.modules.seeds.SeedListener;
import org.archive.modules.seeds.SeedModule;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.JSONUtils;
import org.archive.util.MimetypeUtils;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.ObjectIdentityMemCache;
import org.archive.util.PaddingStringBuffer;
import org.archive.util.Supplier;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.context.Lifecycle;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import com.sleepycat.je.DatabaseException;
/**
* This is an implementation of the AbstractTracker. It is designed to function
* with the WUI as well as performing various logging activity.
* <p>
* At the end of each snapshot a line is written to the
* 'progress-statistics.log' file.
* <p>
* The header of that file is as follows:
* <pre> [timestamp] [discovered] [queued] [downloaded] [doc/s(avg)] [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
* First there is a <b>timestamp</b>, accurate down to 1 second.
* <p>
* <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
* are (respectively) the discovered URI count, pending URI count, successfully
* fetched count and failed fetch count from the frontier at the time of the
* snapshot.
* <p>
* <b>KB/s(avg)</b> is the bandwidth usage. We use the total bytes downloaded
* to calculate average bandwidth usage (KB/sec). Since we also note the value
* each time a snapshot is made we can calculate the average bandwidth usage
* during the last snapshot period to gain a "current" rate. The first number is
* the current and the average is in parenthesis.
* <p>
* <b>doc/s(avg)</b> works the same way as KB/s(avg) except it shows the number
* of documents (URIs) rather than KB downloaded.
* <p>
* <b>busy-threads</b> is the total number of ToeThreads that are not available
* (and thus presumably busy processing a URI). This information is extracted
* from the crawl controller.
* <p>
* Finally mem-use-KB is extracted from the run time environment
* (<code>Runtime.getRuntime().totalMemory()</code>).
* <p>
* In addition to the data collected for the above logs, various other data
* is gathered and stored by this tracker.
* <ul>
* <li> Successfully downloaded documents per fetch status code
* <li> Successfully downloaded documents per document mime type
* <li> Amount of data per mime type
* <li> Successfully downloaded documents per host
* <li> Amount of data per host
* <li> Disposition of all seeds (this is written to 'reports.log' at end of
* crawl)
* <li> Successfully downloaded documents per host per source
* </ul>
*
* @contributor Parker Thompson
* @contributor Kristinn Sigurdsson
* @contributor gojomo
*/
public class StatisticsTracker
implements
ApplicationContextAware,
ApplicationListener<ApplicationEvent>,
SeedListener,
Lifecycle,
Runnable,
Checkpointable,
BeanNameAware {
@SuppressWarnings("unused")
private static final long serialVersionUID = 5L;

/** Seed module, supplied through the autowired setter below. */
protected SeedModule seeds;

/** @return the seed module currently held by this tracker */
public SeedModule getSeeds() {
    return seeds;
}

/** @param seeds seed module to use */
@Autowired
public void setSeeds(SeedModule seeds) {
    this.seeds = seeds;
}

/** BDB module, supplied through the autowired setter below. */
protected BdbModule bdb;

/** @param bdb BDB module to use */
@Autowired
public void setBdbModule(BdbModule bdb) {
    this.bdb = bdb;
}
/** Configured path for the reports directory. */
protected ConfigPath reportsDir =
        new ConfigPath(Engine.REPORTS_DIR_NAME, "${launchId}/reports");

/** @return the configured reports directory path */
public ConfigPath getReportsDir() {
    return this.reportsDir;
}

/** @param reportsDir replacement reports directory path */
public void setReportsDir(ConfigPath reportsDir) {
    this.reportsDir = reportsDir;
}
/** Server cache, supplied through the autowired setter below. */
protected ServerCache serverCache;

/** @return the server cache currently held by this tracker */
public ServerCache getServerCache() {
    return serverCache;
}

/** @param serverCache server cache to use */
@Autowired
public void setServerCache(ServerCache serverCache) {
    this.serverCache = serverCache;
}
/** Size setting for live host reports; defaults to 20. */
protected int liveHostReportSize = 20;

/** @return the configured live host report size */
public int getLiveHostReportSize() {
    return this.liveHostReportSize;
}

/** @param liveHostReportSize new live host report size */
public void setLiveHostReportSize(int liveHostReportSize) {
    this.liveHostReportSize = liveHostReportSize;
}
/** Application context handed in through {@link #setApplicationContext}. */
protected ApplicationContext appCtx;

/**
 * Stores the application context for later use.
 *
 * @param appCtx the application context
 * @throws BeansException declared by the implemented interface; not thrown here
 */
public void setApplicationContext(ApplicationContext appCtx) throws BeansException {
    this.appCtx = appCtx;
}

/** Logger for messages emitted by the StatisticsTracker. */
private final static Logger logger =
        Logger.getLogger(StatisticsTracker.class.getName());
/**
 * Whether seed disposition records are maintained (expensive in crawls
 * with millions of seeds). Enabled by default.
 */
protected boolean trackSeeds = true;

/** @return {@code true} if seed disposition records are maintained */
public boolean getTrackSeeds() {
    return trackSeeds;
}

/** @param trackSeeds whether to maintain seed disposition records */
public void setTrackSeeds(boolean trackSeeds) {
    this.trackSeeds = trackSeeds;
}
/**
 * Whether hosts-per-source-tag records are maintained; very expensive in
 * crawls with large numbers of source-tags (seeds) or large crawls over
 * many hosts. Enabled by default.
 */
protected boolean trackSources = true;

/** @return {@code true} if hosts-per-source-tag records are maintained */
public boolean getTrackSources() {
    return trackSources;
}

/** @param trackSources whether to maintain hosts-per-source-tag records */
public void setTrackSources(boolean trackSources) {
    this.trackSources = trackSources;
}
/**
 * Interval, in seconds, between writes of progress information to the
 * log. Defaults to 20.
 */
protected int intervalSeconds = 20;

/** @return the progress-logging interval in seconds */
public int getIntervalSeconds() {
    return intervalSeconds;
}

/** @param interval new progress-logging interval in seconds */
public void setIntervalSeconds(int interval) {
    intervalSeconds = interval;
}
/**
 * How many crawl-stat sample snapshots are retained for calculation
 * purposes. Defaults to 5.
 */
protected int keepSnapshotsCount = 5;

/** @return the number of snapshots to retain */
public int getKeepSnapshotsCount() {
    return keepSnapshotsCount;
}

/** @param count new number of snapshots to retain */
public void setKeepSnapshotsCount(int count) {
    keepSnapshotsCount = count;
}
/** Crawl controller, supplied through the autowired setter below. */
protected CrawlController controller;

/** @return the crawl controller currently held by this tracker */
public CrawlController getCrawlController() {
    return controller;
}

/** @param controller crawl controller to use */
@Autowired
public void setCrawlController(CrawlController controller) {
    this.controller = controller;
}
/** wall-clock time the crawl started; 0 until a start has been noted */
protected long crawlStartTime;
/** wall-clock time the crawl ended */
protected long crawlEndTime = -1; // Until crawl ends, this value is -1.
/** wall-clock time of last pause, while pause in progress; 0 when not paused */
protected long crawlPauseStarted = 0;
/** duration tally of all time spent in paused state */
protected long crawlTotalPausedTime = 0;
/** snapshots of crawl tallies and rates, newest first */
protected LinkedList<CrawlStatSnapshot> snapshots = new LinkedList<CrawlStatSnapshot>();
/** single-thread scheduler that runs this tracker periodically once started */
protected ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
/*
* Cumulative data
*/
/** tally sizes novel, verified (same hash), vouched (not-modified) */
protected CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();
/** @return the live crawled-bytes tally (not a copy) */
public CrawledBytesHistotable getCrawledBytes() {
return crawledBytes;
}
// TODO: fortify these against key explosion with bigmaps like other tallies
/** Keep track of the file types we see (mime type -> count) */
protected ConcurrentMap<String,AtomicLong> mimeTypeDistribution
= new ConcurrentHashMap<String, AtomicLong>();
/** Keep track of the bytes seen per file type (mime type -> byte total) */
protected ConcurrentMap<String,AtomicLong> mimeTypeBytes
= new ConcurrentHashMap<String, AtomicLong>();
/** Keep track of fetch status codes */
protected ConcurrentMap<String,AtomicLong> statusCodeDistribution
= new ConcurrentHashMap<String, AtomicLong>();
/** Keep track of URL counts per host per seed (source tag -> host -> count) */
// TODO: restore spill-to-disk, like with processedSeedsRecords
protected ConcurrentHashMap<String, ConcurrentMap<String, AtomicLong>> sourceHostDistribution =
new ConcurrentHashMap<String, ConcurrentMap<String,AtomicLong>>();
/* Keep track of 'top' hosts for live reports; created in start() */
protected TopNSet hostsDistributionTop;
protected TopNSet hostsBytesTop;
protected TopNSet hostsLastFinishedTop;
/**
* Record of seeds and latest results; replaced by a BDB-backed cache in
* start()
*/
protected ObjectIdentityCache<SeedRecord> processedSeedsRecords =
new ObjectIdentityMemCache<SeedRecord>();
/** total seed count; -1 until tallySeeds() has run */
protected long seedsTotal = -1;
/** crawled seed count; -1 until tallySeeds() has run */
protected long seedsCrawled = -1;
/** Creates a tracker; collaborators are supplied later through setters. */
public StatisticsTracker() {
}

/** Reports this tracker can produce; {@code null} until first requested or set. */
protected List<Report> reports;

/**
 * Returns the report list, creating the default set on first use.
 * Initialization is lazy so that default report beans are not built
 * pointlessly right before {@link #setReports(List)} replaces them.
 *
 * @return the configured (or default) reports
 */
public List<Report> getReports() {
    if (reports != null) {
        return reports;
    }
    reports = new LinkedList<Report>();
    reports.add(new CrawlSummaryReport());
    reports.add(new SeedsReport());
    reports.add(new HostsReport());
    reports.add(new SourceTagsReport());
    reports.add(new MimetypesReport());
    reports.add(new ResponseCodeReport());
    reports.add(new ProcessorsReport());
    reports.add(new FrontierSummaryReport());
    reports.add(new ToeThreadsReport());
    return reports;
}

/** @param reports replacement report list */
public void setReports(List<Report> reports) {
    this.reports = reports;
}
/** Set by start(), cleared by stop(). */
protected boolean isRunning = false;

/** @return {@code true} between a call to start() and the next stop() */
public boolean isRunning() {
    return this.isRunning;
}
/**
 * Stops the tracker: marks it not running, shuts down the periodic
 * scheduler, records one final progress snapshot and then writes the
 * end-of-crawl reports.
 */
public void stop() {
    this.isRunning = false;
    this.executor.shutdownNow();
    // capture the final state before the reports are produced
    progressStatisticsEvent();
    dumpReports();
}
/**
 * Starts the tracker: opens the seed-record cache, creates the 'top hosts'
 * sets, restores tallies from the recovery checkpoint (if one was set),
 * logs the progress-statistics legend and schedules periodic snapshots
 * every {@link #getIntervalSeconds()} seconds.
 *
 * @throws IllegalStateException if the seed-record cache cannot be opened
 *         or the checkpoint JSON cannot be read
 */
@SuppressWarnings("unchecked")
public void start() {
    isRunning = true;
    boolean isRecover = (recoveryCheckpoint != null);
    try {
        this.processedSeedsRecords = bdb.getObjectCache("processedSeedsRecords",
                isRecover, SeedRecord.class);

        this.hostsDistributionTop = new TopNSet(getLiveHostReportSize());
        this.hostsBytesTop = new TopNSet(getLiveHostReportSize());
        this.hostsLastFinishedTop = new TopNSet(getLiveHostReportSize());

        if (isRecover) {
            restoreFromCheckpoint(recoveryCheckpoint.loadJson(beanName));
        }
    } catch (DatabaseException e) {
        throw new IllegalStateException(e);
    } catch (JSONException e) {
        throw new IllegalStateException(e);
    }

    // Log the legend
    this.controller.logProgressStatistics(progressStatisticsLegend());
    executor.scheduleAtFixedRate(this, 0, getIntervalSeconds(), TimeUnit.SECONDS);
}

/**
 * Reloads timing values and cumulative tallies from the JSON written by
 * {@link #doCheckpoint(Checkpoint)}. Expects the 'top hosts' sets to have
 * been created already.
 *
 * @param json checkpointed state of this bean
 * @throws JSONException if an expected key is missing or has the wrong type
 */
@SuppressWarnings("unchecked") // JSONObject.keys() is consumed as Iterator<String>
private void restoreFromCheckpoint(JSONObject json) throws JSONException {
    crawlStartTime = json.getLong("crawlStartTime");
    crawlEndTime = json.getLong("crawlEndTime");
    crawlTotalPausedTime = json.getLong("crawlTotalPausedTime");
    crawlPauseStarted = json.getLong("crawlPauseStarted");
    // time since the checkpointed pause start is counted as paused time
    tallyCurrentPause();

    JSONUtils.putAllLongs(
            hostsDistributionTop.getTopSet(),
            json.getJSONObject("hostsDistributionTop"));
    hostsDistributionTop.updateBounds();
    JSONUtils.putAllLongs(
            hostsBytesTop.getTopSet(),
            json.getJSONObject("hostsBytesTop"));
    hostsBytesTop.updateBounds();
    JSONUtils.putAllLongs(
            hostsLastFinishedTop.getTopSet(),
            json.getJSONObject("hostsLastFinishedTop"));
    hostsLastFinishedTop.updateBounds();

    JSONUtils.putAllAtomicLongs(
            mimeTypeDistribution,
            json.getJSONObject("mimeTypeDistribution"));
    JSONUtils.putAllAtomicLongs(
            mimeTypeBytes,
            json.getJSONObject("mimeTypeBytes"));
    JSONUtils.putAllAtomicLongs(
            statusCodeDistribution,
            json.getJSONObject("statusCodeDistribution"));

    JSONObject shd = json.getJSONObject("sourceHostDistribution");
    Iterator<String> keyIter = shd.keys();
    while (keyIter.hasNext()) {
        String source = keyIter.next();
        ConcurrentHashMap<String, AtomicLong> hostUriCount =
                new ConcurrentHashMap<String, AtomicLong>();
        JSONUtils.putAllAtomicLongs(hostUriCount, shd.getJSONObject(source));
        sourceHostDistribution.put(source, hostUriCount);
    }

    JSONUtils.putAllLongs(
            crawledBytes,
            json.getJSONObject("crawledBytes"));
}
/**
 * Periodic task body: records one progress snapshot. Called by the
 * ScheduledExecutorService at the interval given by intervalSeconds.
 * <p>
 * Runtime failures are logged rather than propagated, because a task
 * scheduled with {@code scheduleAtFixedRate} that throws has all of its
 * subsequent executions suppressed -- one bad snapshot would otherwise
 * silently end progress logging for the rest of the crawl.
 */
public void run() {
    try {
        progressStatisticsEvent();
    } catch (RuntimeException e) {
        logger.log(Level.SEVERE,
                "progress statistics snapshot failed; will retry at next interval", e);
    }
}
/**
 * Builds the column legend that heads the progress-statistics log.
 *
 * @return legend for progress-statistics lines/log
 */
public String progressStatisticsLegend() {
    StringBuilder legend = new StringBuilder();
    legend.append(" timestamp");
    legend.append(" discovered ");
    legend.append(" queued downloaded doc/s(avg) KB/s(avg) ");
    legend.append(" dl-failures busy-thread mem-use-KB heap-size-KB ");
    legend.append(" congestion max-depth avg-depth");
    return legend.toString();
}
/**
 * Produces the legend followed, on a new line, by the progress line of a
 * freshly collected snapshot.
 *
 * @return two-line progress stamp
 */
public String getProgressStamp() {
    String legend = progressStatisticsLegend();
    String currentLine = getSnapshot().getProgressStatisticsLine();
    return legend + "\n" + currentLine;
}
/**
 * Notify tracker that crawl has begun. Must be called outside tracker's
 * own thread, to ensure it is noted before other threads start
 * interacting with tracker. A start time that is already set is left
 * untouched.
 */
public void noteStart() {
    if (crawlStartTime != 0) {
        // already noted; keep the original start time
        return;
    }
    crawlStartTime = System.currentTimeMillis();
}
/**
 * Records the current crawler state.
 * <p>
 * Invoked by {@link #run()} at the configured interval, and also when a
 * crawl pauses or stops so the state at that point is captured. It
 * collects a snapshot, hands its progress line to
 * {@link CrawlController#logProgressStatistics} (when a controller is
 * set), keeps the snapshot at the head of the retained list (trimming the
 * list to {@link #getKeepSnapshotsCount()}), publishes a
 * {@link StatSnapshotEvent} and finally clears the dnsjava default cache.
 * <p>
 * Implementations overriding this method should consider carefully
 * whether it needs to be synchronized in whole or in part.
 */
protected synchronized void progressStatisticsEvent() {
    final CrawlStatSnapshot latest = getSnapshot();

    if (controller != null) {
        controller.logProgressStatistics(latest.getProgressStatisticsLine());
    }

    // newest first; drop the oldest entries beyond the retention limit
    snapshots.addFirst(latest);
    while (snapshots.size() > getKeepSnapshotsCount()) {
        snapshots.removeLast();
    }

    // publish app event
    appCtx.publishEvent(new StatSnapshotEvent(this, latest));

    // temporary workaround for
    // [ 996161 ] Fix DNSJava issues (memory) -- replace with JNDI-DNS?
    // http://sourceforge.net/support/tracker.php?aid=996161
    Lookup.getDefaultCache(DClass.IN).clearCache();
}
/**
 * Collects a brand-new snapshot from the controller and this tracker.
 * The snapshot is not added to the retained list.
 *
 * @return freshly collected snapshot
 */
public CrawlStatSnapshot getSnapshot() {
    // TODO: take snapshot implementation from a spring prototype?
    final CrawlStatSnapshot fresh = new CrawlStatSnapshot();
    fresh.collect(this.controller, this);
    return fresh;
}

/**
 * Exposes the retained snapshot list itself (not a copy). Deliberately
 * not named getSnapshots, to avoid autodiscovery as an (invalid)
 * bean-property.
 *
 * @return the retained snapshots, newest first
 */
public LinkedList<CrawlStatSnapshot> listSnapshots() {
    return this.snapshots;
}

/**
 * Returns the most recently retained snapshot, or collects a fresh one
 * when none has been retained yet.
 *
 * @return newest retained snapshot, or a fresh snapshot
 */
public CrawlStatSnapshot getLastSnapshot() {
    final CrawlStatSnapshot newest = snapshots.peek();
    if (newest != null) {
        return newest;
    }
    return getSnapshot();
}
/**
 * Computes how long the crawl has actively run, excluding time already
 * tallied as paused.
 *
 * @return elapsed milliseconds; 0 when no start time has been set
 */
public long getCrawlElapsedTime() {
    if (crawlStartTime == 0) {
        // if no start time set yet, consider elapsed time zero
        return 0;
    }

    final long reference;
    if (crawlPauseStarted != 0) {
        // currently paused: measure up to the moment the pause began
        reference = crawlPauseStarted;
    } else if (crawlEndTime > 0) {
        // finished: measure up to the end of the crawl
        reference = crawlEndTime;
    } else {
        // still running: measure up to now
        reference = System.currentTimeMillis();
    }
    return reference - crawlTotalPausedTime - crawlStartTime;
}
/**
 * Logs that the crawl is waiting to pause.
 *
 * @param statusMessage message to append to the note
 */
public void crawlPausing(String statusMessage) {
    logNote("CRAWL WAITING - " + statusMessage);
}

/**
 * Writes a timestamped free-text note to the progress-statistics log via
 * the controller.
 *
 * @param note text to log after the timestamp
 */
protected void logNote(final String note) {
    final PaddingStringBuffer line = new PaddingStringBuffer();
    line.append(ArchiveUtils.getLog14Date(new Date()));
    line.append(" ");
    line.append(note);
    this.controller.logProgressStatistics(line.toString());
}
/**
 * Marks the start of a pause, records a snapshot of the state at that
 * point and logs a note.
 *
 * @param statusMessage message to append to the note
 */
public void crawlPaused(String statusMessage) {
    this.crawlPauseStarted = System.currentTimeMillis();
    progressStatisticsEvent();
    logNote("CRAWL PAUSED - " + statusMessage);
}

/**
 * Closes out any pause in progress, notes the crawl start if none has
 * been noted yet, and logs a note.
 *
 * @param statusMessage message to append to the note
 */
public void crawlResuming(String statusMessage) {
    tallyCurrentPause();
    if (crawlStartTime == 0) {
        noteStart();
    }
    logNote("CRAWL RUNNING - " + statusMessage);
}

/**
 * Logs that the crawl is empty.
 *
 * @param statusMessage message to append to the note
 */
public void crawlEmpty(String statusMessage) {
    logNote("CRAWL EMPTY - " + statusMessage);
}
/**
 * Folds the pause in progress (if any) into the total paused time, then
 * clears the pause marker.
 */
protected void tallyCurrentPause() {
    final long pausedSince = crawlPauseStarted;
    if (pausedSince > 0) {
        // a pause actually began before this resume; count its duration
        crawlTotalPausedTime += (System.currentTimeMillis() - pausedSince);
    }
    crawlPauseStarted = 0;
}
/**
 * Logs that the crawl is ending.
 *
 * @param sExitMessage message to append to the note
 */
public void crawlEnding(String sExitMessage) {
    logNote("CRAWL ENDING - " + sExitMessage);
}

/**
 * Records the crawl end time and logs that the crawl has ended.
 *
 * @param sExitMessage message to append to the note
 */
public void crawlEnded(String sExitMessage) {
    this.crawlEndTime = System.currentTimeMillis();
    logNote("CRAWL ENDED - " + sExitMessage);
}
/**
 * Returns how long the current crawl has been running *including*
 * time paused (contrast with getCrawlElapsedTime()).
 * <p>
 * When no start time has been noted yet the duration is zero, matching
 * getCrawlElapsedTime(); previously the unset start time (0) made this
 * return the number of milliseconds since the epoch.
 *
 * @return The length of time - in msec - that this crawl has been running.
 */
public long getCrawlDuration() {
    if (crawlStartTime == 0) {
        // crawl has not started: nothing to measure
        return 0;
    }
    long end = (crawlEndTime > 0) ? crawlEndTime : System.currentTimeMillis();
    return end - crawlStartTime;
}
/**
 * Returns the live map of encountered mime types, where each entry is
 * mime type -> count.
 * <p>
 * <b>Note:</b> All the values are wrapped with a {@link AtomicLong AtomicLong}
 *
 * @return mimeTypeDistribution
 */
public Map<String, AtomicLong> getFileDistribution() {
    return this.mimeTypeDistribution;
}
/**
 * Adds one to the counter stored under a key. Used for various
 * aggregate data.
 *
 * @param map The ConcurrentMap holding the counters
 * @param key The key for the counter to be incremented; if it does not
 *            exist it will be added (set to 1). If null, the counter
 *            "unknown" is incremented instead.
 */
protected static void incrementMapCount(ConcurrentMap<String,AtomicLong> map,
        String key) {
    incrementMapCount(map, key, 1);
}

/**
 * Adds an arbitrary amount to the counter stored under a key. Used for
 * various aggregate data. The increment amount can be negative.
 *
 * @param map
 *            The ConcurrentMap holding the counters
 * @param key
 *            The key for the counter to be incremented; if it does not
 *            exist it will be added (set equal to <code>increment</code>).
 *            If null, the counter "unknown" is incremented instead.
 * @param increment
 *            The amount to add to the counter related to <code>key</code>.
 */
protected static void incrementMapCount(ConcurrentMap<String,AtomicLong> map,
        String key, long increment) {
    final String counterKey = (key == null) ? "unknown" : key;

    AtomicLong counter = map.get(counterKey);
    if (counter == null) {
        // first sighting of this key; another thread may install a
        // counter at the same moment, in which case theirs is used
        final AtomicLong fresh = new AtomicLong(0);
        final AtomicLong raced = map.putIfAbsent(counterKey, fresh);
        counter = (raced == null) ? fresh : raced;
    }
    counter.addAndGet(increment);
}
/**
 * Sort the entries of the given Map in descending order by their
 * values, which must be longs wrapped with <code>AtomicLong</code>.
 * <p>
 * Elements are sorted by value from largest to smallest: each entry is
 * stored under its negated value so that natural ascending key order
 * yields largest-first. The returned map is a StoredSortedMap, and
 * thus may include duplicate keys.
 * <p>
 * If the passed-in map requires access to be synchronized, the caller
 * should ensure this synchronization.
 *
 * @param mapOfAtomicLongValues
 *            Assumes values are wrapped with AtomicLong.
 * @return a sorted map of (negated count -> original key) holding one
 *         entry per entry of the input map
 */
public DisposableStoredSortedMap<Long,String> getReverseSortedCopy(
        final Map<String,AtomicLong> mapOfAtomicLongValues) {
    DisposableStoredSortedMap<Long,String> sortedMap =
        bdb.getStoredMap(
                null,
                Long.class,
                String.class,
                true,
                false);
    // iterate entries rather than keys: avoids a second lookup per key and
    // a null value if a key disappears between keySet() and get()
    for (Map.Entry<String,AtomicLong> entry : mapOfAtomicLongValues.entrySet()) {
        sortedMap.put(-entry.getValue().longValue(), entry.getKey());
    }
    return sortedMap;
}
/**
 * Returns the live map describing the distribution of fetch status
 * codes, where each entry is (string)code -> count.
 * <p>
 * <b>Note: </b> All the values are wrapped with a
 * {@link AtomicLong AtomicLong}
 *
 * @return statusCodeDistribution
 */
public Map<String, AtomicLong> getStatusCodeDistribution() {
    return this.statusCodeDistribution;
}
/**
 * Looks up the last-success time recorded in the substats of the given
 * host.
 *
 * @param host The host to look up time of last completed URI.
 * @return the last-success time (in millisec) reported by the host's
 *         substats
 */
public long getHostLastFinished(String host){
    final CrawlHost crawlHost = serverCache.getHostFor(host);
    return crawlHost.getSubstats().getLastSuccessTime();
}

/**
 * Looks up the total byte count recorded in the substats of the given
 * host.
 *
 * @param host name of the host
 * @return the total bytes reported by the host's substats
 */
public long getBytesPerHost(String host){
    final CrawlHost crawlHost = serverCache.getHostFor(host);
    return crawlHost.getSubstats().getTotalBytes();
}
/**
 * Returns the accumulated number of bytes from files of a given file
 * type, as tallied in mimeTypeBytes.
 *
 * @param filetype Filetype to check.
 * @return the accumulated number of bytes for the given mime type, or the
 *         value getReportValue yields when there is no such tally
 */
public long getBytesPerFileType(String filetype){
    final long bytes = getReportValue(mimeTypeBytes, filetype);
    return bytes;
}
/**
 * Get the total number of ToeThreads (sleeping and active), as reported
 * by the controller.
 *
 * @return The total number of ToeThreads, or 0 when no controller is set
 */
public int threadCount() {
    if (this.controller == null) {
        return 0;
    }
    return controller.getToeCount();
}

/** @return the summary string produced by the crawled-bytes tally */
public String crawledBytesSummary() {
    return this.crawledBytes.summary();
}
/**
 * If seed tracking is enabled and the curi is a seed, creates or updates
 * its record in the processedSeedsRecords cache.
 *
 * @param curi The CrawlURI that may be a seed.
 * @param disposition The disposition of the CrawlURI.
 */
protected void handleSeed(final CrawlURI curi, final String disposition) {
    if (!getTrackSeeds() || !curi.isSeed()) {
        // tracking disabled, or not a seed: ignore
        return;
    }
    final Supplier<SeedRecord> newRecord = new Supplier<SeedRecord>() {
        public SeedRecord get() {
            return new SeedRecord(curi, disposition);
        }
    };
    final SeedRecord record = processedSeedsRecords.getOrUse(curi.getURI(), newRecord);
    record.updateWith(curi, disposition);
}
/**
 * Updates all running tallies for a successfully crawled URI: seed
 * record, crawled bytes, status-code and mime-type distributions, 'top
 * host' stats and (when source tracking is enabled and the URI carries a
 * source tag) the per-source host counts.
 *
 * @param curi the successfully crawled URI
 */
public void crawledURISuccessful(CrawlURI curi) {
    handleSeed(curi,"Seed successfully crawled");
    // save crawled bytes tally
    crawledBytes.accumulate(curi);

    // Save status codes
    incrementMapCount(statusCodeDistribution,
        Integer.toString(curi.getFetchStatus()));

    // Save mime types
    String mime = MimetypeUtils.truncate(curi.getContentType());
    incrementMapCount(mimeTypeDistribution, mime);
    incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());

    // Save hosts stats. The host name is resolved once and reused below
    // instead of going through the server cache a second time.
    final String hostName = serverCache.getHostFor(curi.getUURI()).getHostName();
    saveHostStats(hostName, curi.getContentSize());

    if (getTrackSources() && curi.getData().containsKey(A_SOURCE_TAG)) {
        saveSourceStats((String)curi.getData().get(A_SOURCE_TAG), hostName);
    }
}
/**
 * Adds one to the URI count kept for the given host under the given
 * source tag, creating the per-source map on first use.
 *
 * @param source source tag
 * @param hostname host name to count against the source
 */
protected void saveSourceStats(String source, String hostname) {
    ConcurrentMap<String,AtomicLong> perHost = sourceHostDistribution.get(source);
    if (perHost == null) {
        // first URI for this source; another thread may install a map at
        // the same moment, in which case theirs is used
        final ConcurrentMap<String,AtomicLong> fresh =
                new ConcurrentHashMap<String,AtomicLong>();
        final ConcurrentMap<String,AtomicLong> raced =
                sourceHostDistribution.putIfAbsent(source, fresh);
        perHost = (raced != null) ? raced : fresh;
    }
    incrementMapCount(perHost, hostname);
}
/**
 * Refreshes the three 'top hosts' sets from the named host's substats
 * after a URI success.
 *
 * @param hostname name of the host whose stats are pushed to the top sets
 * @param size content size of the URI (not used by this implementation)
 */
protected void saveHostStats(String hostname, long size) {
    // TODO: consider moving 'top' accounting elsewhere, such
    // as the frontier or ServerCache itself
    final CrawlHost crawlHost = serverCache.getHostFor(hostname);

    final long successes = crawlHost.getSubstats().getFetchSuccesses();
    hostsDistributionTop.update(hostname, successes);

    final long successBytes = crawlHost.getSubstats().getSuccessBytes();
    hostsBytesTop.update(hostname, successBytes);

    final long lastSuccess = crawlHost.getSubstats().getLastSuccessTime();
    hostsLastFinishedTop.update(hostname, lastSuccess);
}
/**
 * Records a deferred-for-retry outcome against the seed record, if the
 * URI is a seed.
 *
 * @param curi the URI that will be retried
 */
public void crawledURINeedRetry(CrawlURI curi) {
    final String disposition = "Failed to crawl seed, will retry";
    handleSeed(curi, disposition);
}

/**
 * Records a disregarded outcome against the seed record, if the URI is a
 * seed.
 *
 * @param curi the disregarded URI
 */
public void crawledURIDisregard(CrawlURI curi) {
    final String disposition = "Seed was disregarded";
    handleSeed(curi, disposition);
}

/**
 * Records a failed outcome against the seed record, if the URI is a seed.
 *
 * @param curi the failed URI
 */
public void crawledURIFailure(CrawlURI curi) {
    final String disposition = "Failed to crawl seed";
    handleSeed(curi, disposition);
}
/**
 * Get a seed iterator for the job being monitored. Only reports known
 * seeds from processedSeedsRecords -- but as a SeedListener, that should
 * be complete.
 *
 * <b>Note:</b> This iterator walks the keys of processedSeedsRecords,
 * which are <i>strings</i> (URI strings), not UURIs.
 *
 * @return the seed iterator
 */
public Iterator<String> getSeedsIterator() {
    return this.processedSeedsRecords.keySet().iterator();
}
/**
 * Builds a temporary sorted map of all known seed records, keyed by each
 * record's sort-shifted status code. A seed key with no stored record
 * gets a synthesized "not processed" record that is not retained.
 *
 * @return sorted map of (sort-shifted status code -> seed record)
 */
public DisposableStoredSortedMap<Integer,SeedRecord> calcSeedRecordsSortedByStatusCode() {
    final Iterator<String> seedKeys = getSeedsIterator();
    final DisposableStoredSortedMap<Integer,SeedRecord> byStatus =
        bdb.getStoredMap(
                null,
                Integer.class,
                SeedRecord.class,
                true,
                false);
    while (seedKeys.hasNext()) {
        final String seed = seedKeys.next();
        final SeedRecord stored = processedSeedsRecords.get(seed);
        // no need to retain a synthesized record
        final SeedRecord record = (stored != null)
                ? stored
                : new SeedRecord(seed, "Seed has not been processed");
        byStatus.put(record.sortShiftStatusCode(), record);
    }
    return byStatus;
}
/**
 * Return a copy of the given host counts in reverse-sorted (largest
 * first) order. The copy is made while holding the monitor of the
 * passed-in map.
 *
 * @param hostCounts map of host -> count
 * @return SortedMap of hosts distribution
 */
public DisposableStoredSortedMap<Long,String> getReverseSortedHostCounts(
        Map<String,AtomicLong> hostCounts) {
    final DisposableStoredSortedMap<Long,String> sortedCopy;
    synchronized (hostCounts) {
        sortedCopy = getReverseSortedCopy(hostCounts);
    }
    return sortedCopy;
}
/**
 * Builds a temporary map of every host in the server cache, ordered by
 * fetch-success count, largest first (entries are stored under the
 * negated count).
 *
 * @return SortedMap of hosts distribution
 */
public DisposableStoredSortedMap<Long,String> calcReverseSortedHostsDistribution() {
    final DisposableStoredSortedMap<Long,String> byCount =
        bdb.getStoredMap(
                null,
                Long.class,
                String.class,
                true,
                false);
    final Closure recordHost = new Closure() {
        @Override
        public void execute(Object hostObj) {
            final CrawlHost crawlHost = (CrawlHost) hostObj;
            final long successes = crawlHost.getSubstats().getFetchSuccesses();
            byCount.put(-successes, crawlHost.getHostName());
        }
    };
    serverCache.forAllHostsDo(recordHost);
    return byCount;
}
/**
 * Writes the first configured report whose class simple name equals
 * the given name, without forcing an overwrite.
 *
 * @param reportName simple class name of the wanted report
 * @return the report file, or {@code null} when no report matches
 */
public File writeReportFile(String reportName) {
    for (Report candidate : getReports()) {
        final String simpleName = candidate.getClass().getSimpleName();
        if (!simpleName.equals(reportName)) {
            continue;
        }
        return writeReportFile(candidate, false);
    }
    return null;
}
/**
 * Writes one report into the reports directory.
 * <p>
 * An existing file is reused untouched when the controller has started
 * and is no longer running, unless {@code force} is set. Otherwise the
 * report is (re)written; the writer is always closed, even if the report
 * throws, and success is logged only when the write actually succeeded.
 * I/O failures are logged at SEVERE and not propagated.
 *
 * @param report report to write
 * @param force  overwrite even when a finished crawl already has the file
 * @return the report file (whether or not writing succeeded)
 */
protected File writeReportFile(Report report, boolean force) {
    File f = new File(getReportsDir().getFile(), report.getFilename());

    if(f.exists() && !controller.isRunning() && controller.hasStarted() && !force) {
        // controller already started and stopped
        //  and file exists
        //  and force not requested
        // so, don't overwrite
        logger.info("reusing report: " + f.getAbsolutePath());
        return f;
    }

    try {
        FileUtils.ensureWriteableDirectory(f.getParentFile());
        PrintWriter bw = new PrintWriter(new FileWriter(f));
        try {
            report.write(bw, this);
            // PrintWriter swallows IOExceptions; surface them explicitly
            if (bw.checkError()) {
                throw new IOException("error while writing report content");
            }
        } finally {
            bw.close();
        }
        addToManifest(f.getAbsolutePath(),
            CrawlerLoggerModule.MANIFEST_REPORT_FILE, true);
        logger.info("wrote report: " + f.getAbsolutePath());
    } catch (IOException e) {
        logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath() +
            " at the end of crawl.", e);
    }
    return f;
}
/**
 * Placeholder hook for adding a written file to a manifest; currently does
 * nothing.
 *
 * @param absolutePath absolute path of the file
 * @param manifest_report_file manifest type marker
 * @param b flag whose meaning is not defined by this stub
 */
protected void addToManifest(String absolutePath, char manifest_report_file, boolean b) {
// TODO Auto-generated method stub
}
/**
 * Run the reports: force-writes every configured report that asks to be
 * written at end of crawl. A runtime failure in one report is logged and
 * does not prevent the remaining reports from being written.
 */
public void dumpReports() {
    // TODO: sooner than here! Add all files mentioned in the crawl
    // order to the manifest set.
    //controller.addOrderToManifest();

    for (Report report : getReports()) {
        if (!report.getShouldReportAtEndOfCrawl()) {
            continue;
        }
        try {
            writeReportFile(report, true);
        } catch (RuntimeException re) {
            logger.log(Level.SEVERE, re.getMessage(), re);
        }
    }
}
/**
 * Logs a note naming the checkpoint directory. The CrawlController is
 * managing the checkpointing of this object, so nothing is saved here.
 *
 * @param def unused
 * @param cpDir checkpoint directory, named in the log note
 * @throws Exception declared for callers; not thrown by this implementation
 */
public void crawlCheckpoint(/*StateProvider*/ Object def, File cpDir) throws Exception {
    final String target = cpDir.toString();
    logNote("CRAWL CHECKPOINTING TO " + target);
}
/**
 * Reads the current value of a counter from a tally map.
 *
 * @param map tally map of key -> counter
 * @param key counter to read
 * @return the counter value; -1 when {@code key} is null; -2 when the map
 *         holds nothing for {@code key}
 * @throws IllegalStateException if the stored value is not an AtomicLong
 */
private long getReportValue(Map<String,AtomicLong> map, String key) {
    if (key == null) {
        return -1;
    }
    final Object stored = map.get(key);
    if (stored == null) {
        return -2;
    }
    if (stored instanceof AtomicLong) {
        return ((AtomicLong) stored).get();
    }
    throw new IllegalStateException("Expected AtomicLong but got "
            + stored.getClass() + " for " + key);
}
/**
 * Routes application events: crawl-state changes go to the matching
 * crawl* method and URI disposition events to the matching crawledURI*
 * method. Other event types are ignored.
 *
 * @param event the published application event
 * @throws RuntimeException if a state or disposition is not recognized
 */
public void onApplicationEvent(ApplicationEvent event) {
    if (event instanceof CrawlStateEvent) {
        dispatchCrawlState((CrawlStateEvent) event);
    }
    if (event instanceof CrawlURIDispositionEvent) {
        dispatchDisposition((CrawlURIDispositionEvent) event);
    }
}

/** Calls the crawl* method that corresponds to the event's state. */
private void dispatchCrawlState(CrawlStateEvent stateEvent) {
    final String message = stateEvent.getMessage();
    switch (stateEvent.getState()) {
    case PAUSED:
        this.crawlPaused(message);
        break;
    case RUNNING:
        this.crawlResuming(message);
        break;
    case EMPTY:
        this.crawlEmpty(message);
        break;
    case PAUSING:
        this.crawlPausing(message);
        break;
    case STOPPING:
        this.crawlEnding(message);
        break;
    case FINISHED:
        this.crawlEnded(message);
        break;
    case PREPARING:
        this.crawlResuming(message);
        break;
    default:
        throw new RuntimeException("Unknown state: " + stateEvent.getState());
    }
}

/** Calls the crawledURI* method that corresponds to the event's disposition. */
private void dispatchDisposition(CrawlURIDispositionEvent dispositionEvent) {
    switch (dispositionEvent.getDisposition()) {
    case SUCCEEDED:
        this.crawledURISuccessful(dispositionEvent.getCrawlURI());
        break;
    case FAILED:
        this.crawledURIFailure(dispositionEvent.getCrawlURI());
        break;
    case DISREGARDED:
        this.crawledURIDisregard(dispositionEvent.getCrawlURI());
        break;
    case DEFERRED_FOR_RETRY:
        this.crawledURINeedRetry(dispositionEvent.getCrawlURI());
        break;
    default:
        throw new RuntimeException("Unknown disposition: " + dispositionEvent.getDisposition());
    }
}
/**
 * Recounts seedsTotal and seedsCrawled from the seed records. Every known
 * seed key counts toward the total; a seed counts as crawled when its
 * record exists and has a status code greater than zero.
 */
public void tallySeeds() {
    seedsTotal = 0;
    seedsCrawled = 0;
    if (processedSeedsRecords == null) {
        // nothing to tally
        return;
    }
    final Iterator<String> seedKeys = getSeedsIterator();
    while (seedKeys.hasNext()) {
        final SeedRecord record = processedSeedsRecords.get(seedKeys.next());
        seedsTotal++;
        if (record != null && record.getStatusCode() > 0) {
            seedsCrawled++;
        }
    }
}
/**
 * Create a seed record, even on initial notification (before any real
 * attempt/processing), so undisposed seeds also appear in reports.
 *
 * @see org.archive.modules.seeds.SeedListener#addedSeed(org.archive.modules.CrawlURI)
 */
public void addedSeed(CrawlURI curi) {
    // empty disposition: nothing has happened to this seed yet
    handleSeed(curi, "");
}
/**
 * Do nothing with nonseed lines.
 *
 * @param line the non-seed line (ignored)
 * @return always {@code false}
 * @see org.archive.modules.seeds.SeedListener#nonseedLine(java.lang.String)
 */
public boolean nonseedLine(String line) {
    final boolean handled = false;
    return handled;
}

/** No action is taken when a seed batch concludes. */
public void concludedSeedBatch() {
    // do nothing;
}
// BeanNameAware
/** Bean name, used as the key when saving/loading checkpoint JSON. */
protected String beanName;

/** @param name the bean name to remember */
public void setBeanName(String name) {
    beanName = name;
}
// Checkpointable
/** Nothing to prepare before a checkpoint. */
public void startCheckpoint(Checkpoint checkpointInProgress) {
}

/**
 * Saves timing values and cumulative tallies as JSON under this bean's
 * name. When no pause is in progress, the current time is written as the
 * pause start instead.
 *
 * @param checkpointInProgress checkpoint receiving the JSON
 * @throws IOException declared by the interface
 * @throws RuntimeException wrapping a JSONException, which is not expected
 */
public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
    final JSONObject state = new JSONObject();
    try {
        state.put("crawlStartTime", crawlStartTime);
        state.put("crawlEndTime", crawlEndTime);

        long pauseStartToSave = crawlPauseStarted;
        if (pauseStartToSave < 1) {
            // TODO: use instant checkpoint started?
            pauseStartToSave = System.currentTimeMillis();
        }
        state.put("crawlPauseStarted", pauseStartToSave);
        state.put("crawlTotalPausedTime", crawlTotalPausedTime);

        state.put("hostsDistributionTop", hostsDistributionTop.getTopSet());
        state.put("hostsBytesTop", hostsBytesTop.getTopSet());
        state.put("hostsLastFinishedTop", hostsLastFinishedTop.getTopSet());

        state.put("mimeTypeDistribution", mimeTypeDistribution);
        state.put("mimeTypeBytes", mimeTypeBytes);
        state.put("statusCodeDistribution", statusCodeDistribution);
        state.put("sourceHostDistribution", sourceHostDistribution);

        state.put("crawledBytes", crawledBytes);
        // TODO: save crawledBytesHistotable

        checkpointInProgress.saveJson(beanName, state);
    } catch (JSONException e) {
        // impossible
        throw new RuntimeException(e);
    }
}

/** Nothing to clean up after a checkpoint. */
public void finishCheckpoint(Checkpoint checkpointInProgress) {
}
/** Checkpoint to recover from; {@code null} means no recovery. */
protected Checkpoint recoveryCheckpoint;

/** @param recoveryCheckpoint checkpoint to recover from */
public void setRecoveryCheckpoint(Checkpoint recoveryCheckpoint) {
    this.recoveryCheckpoint = recoveryCheckpoint;
}
}