/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.reporting;
import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.FileHandler;
import java.util.logging.Formatter;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import org.apache.commons.httpclient.URIException;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.crawler.framework.Engine;
import org.archive.crawler.io.NonFatalErrorFormatter;
import org.archive.crawler.io.RuntimeErrorFormatter;
import org.archive.crawler.io.StatisticsLogFormatter;
import org.archive.crawler.io.UriErrorFormatter;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.crawler.util.Logs;
import org.archive.io.GenerationFileHandler;
import org.archive.modules.SimpleFileLoggerProvider;
import org.archive.modules.extractor.UriErrorLoggerModule;
import org.archive.net.UURI;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;
/**
* Module providing all expected whole-crawl logging facilities
*
* @contributor pjack
* @contributor gojomo
*/
public class CrawlerLoggerModule
implements
UriErrorLoggerModule, Lifecycle, InitializingBean,
Checkpointable, SimpleFileLoggerProvider, DisposableBean {
@SuppressWarnings("unused")
private static final long serialVersionUID = 1L;
protected ConfigPath path = new ConfigPath(Engine.LOGS_DIR_NAME,"${launchId}/logs");
public ConfigPath getPath() {
return path;
}
public void setPath(ConfigPath cp) {
this.path.merge(cp);
}
/**
* Whether to include the "extra info" field for each entry in crawl.log.
* "Extra info" is arbitrary JSON. It is the last field of the log line.
*/
protected boolean logExtraInfo = false;
public boolean getLogExtraInfo() {
return logExtraInfo;
}
public void setLogExtraInfo(boolean logExtraInfo) {
this.logExtraInfo = logExtraInfo;
}
// manifest support
/** abbreviation label for config files in manifest */
public static final char MANIFEST_CONFIG_FILE = 'C';
/** abbreviation label for report files in manifest */
public static final char MANIFEST_REPORT_FILE = 'R';
/** abbreviation label for log files in manifest */
public static final char MANIFEST_LOG_FILE = 'L';
// key log names
private static final String LOGNAME_CRAWL = "crawl";
private static final String LOGNAME_ALERTS = "alerts";
private static final String LOGNAME_PROGRESS_STATISTICS =
"progress-statistics";
private static final String LOGNAME_URI_ERRORS = "uri-errors";
private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors";
private static final String LOGNAME_NONFATAL_ERRORS = "nonfatal-errors";
protected ConfigPath crawlLogPath =
new ConfigPath(Logs.CRAWL.getFilename(),Logs.CRAWL.getFilename());
public ConfigPath getCrawlLogPath() {
return crawlLogPath;
}
public void setCrawlLogPath(ConfigPath cp) {
this.crawlLogPath.merge(cp);
}
protected ConfigPath alertsLogPath =
new ConfigPath(Logs.ALERTS.getFilename(),Logs.ALERTS.getFilename());
public ConfigPath getAlertsLogPath() {
return alertsLogPath;
}
public void setAlertsLogPath(ConfigPath cp) {
this.alertsLogPath.merge(cp);
}
protected ConfigPath progressLogPath =
new ConfigPath(Logs.PROGRESS_STATISTICS.getFilename(),Logs.PROGRESS_STATISTICS.getFilename());
public ConfigPath getProgressLogPath() {
return progressLogPath;
}
public void setProgressLogPath(ConfigPath cp) {
this.progressLogPath.merge(cp);
}
protected ConfigPath uriErrorsLogPath =
new ConfigPath(Logs.URI_ERRORS.getFilename(),Logs.URI_ERRORS.getFilename());
public ConfigPath getUriErrorsLogPath() {
return uriErrorsLogPath;
}
public void setUriErrorsLogPath(ConfigPath cp) {
this.uriErrorsLogPath.merge(cp);
}
protected ConfigPath runtimeErrorsLogPath =
new ConfigPath(Logs.RUNTIME_ERRORS.getFilename(),Logs.RUNTIME_ERRORS.getFilename());
public ConfigPath getRuntimeErrorsLogPath() {
return runtimeErrorsLogPath;
}
public void setRuntimeErrorsLogPath(ConfigPath cp) {
this.runtimeErrorsLogPath.merge(cp);
}
protected ConfigPath nonfatalErrorsLogPath =
new ConfigPath(Logs.NONFATAL_ERRORS.getFilename(),Logs.NONFATAL_ERRORS.getFilename());
public ConfigPath getNonfatalErrorsLogPath() {
return nonfatalErrorsLogPath;
}
public void setNonfatalErrorsLogPath(ConfigPath cp) {
this.nonfatalErrorsLogPath.merge(cp);
}
/** suffix to use on active logs */
// public static final String CURRENT_LOG_SUFFIX = ".log";
/**
* Crawl progress logger.
*
* No exceptions. Logs summary result of each url processing.
*/
private transient Logger uriProcessing;
/**
* This logger contains unexpected runtime errors.
*
* Would contain errors trying to set up a job or failures inside
* processors that they are not prepared to recover from.
*/
private transient Logger runtimeErrors;
/**
* This logger is for job-scoped logging, specifically recoverable
* errors which happen and are handled within a particular processor.
*
* Examples would be socket timeouts, exceptions thrown by
* extractors, etc.
*/
private transient Logger nonfatalErrors;
/**
* Special log for URI format problems, wherever they may occur.
*/
private transient Logger uriErrors;
/**
* Statistics tracker writes here at regular intervals.
*/
private transient Logger progressStats;
/**
* Record of fileHandlers established for loggers,
* assisting file rotation.
*/
transient private Map<Logger,FileHandler> fileHandlers;
private StringBuffer manifest = new StringBuffer();
private transient AlertThreadGroup atg;
public CrawlerLoggerModule() {
}
public void start() {
if(isRunning) {
return;
}
this.atg = AlertThreadGroup.current();
try {
FileUtils.ensureWriteableDirectory(getPath().getFile());
setupLogs();
} catch (IOException e) {
throw new IllegalStateException(e);
}
isRunning = true;
}
protected boolean isRunning = false;
public boolean isRunning() {
return this.isRunning;
}
public void stop() {
isRunning = false;
}
public void destroy() {
closeLogFiles();
}
protected void setupLogs() throws IOException {
String logsPath = getPath().getFile().getAbsolutePath() + File.separatorChar;
uriProcessing = Logger.getLogger(LOGNAME_CRAWL + "." + logsPath);
runtimeErrors = Logger.getLogger(LOGNAME_RUNTIME_ERRORS + "." +
logsPath);
nonfatalErrors = Logger.getLogger(LOGNAME_NONFATAL_ERRORS + "." + logsPath);
uriErrors = Logger.getLogger(LOGNAME_URI_ERRORS + "." + logsPath);
progressStats = Logger.getLogger(LOGNAME_PROGRESS_STATISTICS + "." +
logsPath);
this.fileHandlers = new HashMap<Logger,FileHandler>();
setupLogFile(uriProcessing,
getCrawlLogPath().getFile().getAbsolutePath(),
new UriProcessingFormatter(getLogExtraInfo()), true);
setupLogFile(runtimeErrors,
getRuntimeErrorsLogPath().getFile().getAbsolutePath(),
new RuntimeErrorFormatter(getLogExtraInfo()), true);
setupLogFile(nonfatalErrors,
getNonfatalErrorsLogPath().getFile().getAbsolutePath(),
new NonFatalErrorFormatter(getLogExtraInfo()), true);
setupLogFile(uriErrors,
getUriErrorsLogPath().getFile().getAbsolutePath(),
new UriErrorFormatter(), true);
setupLogFile(progressStats,
getProgressLogPath().getFile().getAbsolutePath(),
new StatisticsLogFormatter(), true);
setupAlertLog(logsPath);
}
private void setupLogFile(Logger logger, String filename, Formatter f,
boolean shouldManifest) throws IOException, SecurityException {
logger.setLevel(Level.INFO); // set all standard loggers to INFO
GenerationFileHandler fh = GenerationFileHandler.makeNew(filename, false,
shouldManifest);
fh.setFormatter(f);
logger.addHandler(fh);
addToManifest(filename, MANIFEST_LOG_FILE, shouldManifest);
logger.setUseParentHandlers(false);
this.fileHandlers.put(logger, fh);
}
public Logger setupSimpleLog(String logName) {
Logger logger = Logger.getLogger(logName + ".log");
Formatter f = new Formatter() {
public String format(java.util.logging.LogRecord record) {
return ArchiveUtils.getLog17Date(record.getMillis()) + " " + record.getMessage() + '\n';
}
};
ConfigPath logPath = new ConfigPath(logName + ".log", logName + ".log");
logPath.setBase(getPath());
try {
setupLogFile(logger, logPath.getFile().getAbsolutePath(), f, true);
} catch (IOException e) {
throw new IllegalStateException(e);
}
return logger;
}
private void setupAlertLog(String logsPath) throws IOException {
Logger logger = Logger.getLogger(LOGNAME_ALERTS + "." + logsPath);
String filename = getAlertsLogPath().getFile().getAbsolutePath();
GenerationFileHandler fh =
GenerationFileHandler.makeNew(filename, false, true);
fh.setFormatter(new SimpleFormatter());
AlertThreadGroup.current().addLogger(logger);
AlertHandler.ensureStaticInitialization();
logger.addHandler(fh);
addToManifest(filename, MANIFEST_LOG_FILE, true);
logger.setUseParentHandlers(false);
this.fileHandlers.put(logger, fh);
}
public void rotateLogFiles() throws IOException {
rotateLogFiles("." + ArchiveUtils.get14DigitDate());
}
protected void rotateLogFiles(String generationSuffix)
throws IOException {
rotateLogFiles(generationSuffix, false);
}
protected void rotateLogFiles(String generationSuffix, boolean mergeOld)
throws IOException {
for (Logger l : fileHandlers.keySet()) {
GenerationFileHandler gfh = (GenerationFileHandler) fileHandlers.get(l);
GenerationFileHandler newGfh = gfh.rotate(generationSuffix, "", mergeOld);
if (gfh.shouldManifest()) {
addToManifest((String) newGfh.getFilenameSeries().get(1),
MANIFEST_LOG_FILE, newGfh.shouldManifest());
}
l.removeHandler(gfh);
l.addHandler(newGfh);
fileHandlers.put(l, newGfh);
}
}
/**
* Close all log files and remove handlers from loggers.
*/
public void closeLogFiles() {
if (fileHandlers != null) {
for (Logger l: fileHandlers.keySet()) {
GenerationFileHandler gfh =
(GenerationFileHandler)fileHandlers.get(l);
gfh.close();
l.removeHandler(gfh);
}
}
}
/**
* Add a file to the manifest of files used/generated by the current
* crawl.
*
* TODO: Its possible for a file to be added twice if reports are
* force generated midcrawl. Fix.
*
* @param file The filename (with absolute path) of the file to add
* @param type The type of the file
* @param bundle Should the file be included in a typical bundling of
* crawler files.
*
* @see #MANIFEST_CONFIG_FILE
* @see #MANIFEST_LOG_FILE
* @see #MANIFEST_REPORT_FILE
*/
public void addToManifest(String file, char type, boolean bundle) {
manifest.append(type + (bundle? "+": "-") + " " + file + "\n");
}
public void startCheckpoint(Checkpoint checkpointInProgress) {}
/**
* Run checkpointing.
*
* <p>Default access only to be called by Checkpointer.
* @throws Exception
*/
public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
// Rotate off crawler logs.
rotateLogFiles("." + checkpointInProgress.getName(),
checkpointInProgress.getForgetAllButLatest());
}
public void finishCheckpoint(Checkpoint checkpointInProgress) {}
protected Checkpoint recoveryCheckpoint;
@Autowired(required=false)
public void setRecoveryCheckpoint(Checkpoint checkpoint) {
this.recoveryCheckpoint = checkpoint;
}
public Logger getNonfatalErrors() {
return nonfatalErrors;
}
public Logger getProgressStats() {
return progressStats;
}
public Logger getRuntimeErrors() {
return runtimeErrors;
}
public Logger getUriErrors() {
return uriErrors;
}
public Logger getUriProcessing() {
return uriProcessing;
}
public int getAlertCount() {
if (atg != null) {
return atg.getAlertCount();
} else {
return -1;
}
}
public void resetAlertCount() {
if (atg != null) {
atg.resetAlertCount();
}
}
/**
* Log a URIException from deep inside other components to the crawl's
* shared log.
*
* @param e URIException encountered
* @param u CrawlURI where problem occurred
* @param l String which could not be interpreted as URI without exception
*/
public void logUriError(URIException e, UURI u, CharSequence l) {
Object[] array = {u, l};
uriErrors.log(Level.INFO, e.getMessage(), array);
}
private void readObject(ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
getPath().getFile().mkdirs();
this.atg = AlertThreadGroup.current();
this.setupLogs();
}
public void afterPropertiesSet() throws Exception {
ConfigPath[] paths = {
crawlLogPath, alertsLogPath, progressLogPath,
uriErrorsLogPath, runtimeErrorsLogPath, nonfatalErrorsLogPath };
for(ConfigPath cp : paths) {
if(cp.getBase()==null) {
cp.setBase(getPath());
}
}
}
}