Package org.archive.crawler.reporting

Source Code of org.archive.crawler.reporting.CrawlerLoggerModule

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.archive.crawler.reporting;

import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.FileHandler;
import java.util.logging.Formatter;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;

import org.apache.commons.httpclient.URIException;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.crawler.framework.Engine;
import org.archive.crawler.io.NonFatalErrorFormatter;
import org.archive.crawler.io.RuntimeErrorFormatter;
import org.archive.crawler.io.StatisticsLogFormatter;
import org.archive.crawler.io.UriErrorFormatter;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.crawler.util.Logs;
import org.archive.io.GenerationFileHandler;
import org.archive.modules.SimpleFileLoggerProvider;
import org.archive.modules.extractor.UriErrorLoggerModule;
import org.archive.net.UURI;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;

/**
* Module providing all expected whole-crawl logging facilities
*
* @contributor pjack
* @contributor gojomo
*/
public class CrawlerLoggerModule
    implements
        UriErrorLoggerModule, Lifecycle, InitializingBean,
        Checkpointable, SimpleFileLoggerProvider, DisposableBean {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    protected ConfigPath path = new ConfigPath(Engine.LOGS_DIR_NAME,"${launchId}/logs");
    public ConfigPath getPath() {
        return path;
    }
    public void setPath(ConfigPath cp) {
        this.path.merge(cp);
    }

    /**
     * Whether to include the "extra info" field for each entry in crawl.log.
     * "Extra info" is arbitrary JSON. It is the last field of the log line.
     */
    protected boolean logExtraInfo = false;
    public boolean getLogExtraInfo() {
        return logExtraInfo;
    }
    public void setLogExtraInfo(boolean logExtraInfo) {
        this.logExtraInfo = logExtraInfo;
    }
   
    // manifest support
    /** abbreviation label for config files in manifest */
    public static final char MANIFEST_CONFIG_FILE = 'C';
    /** abbreviation label for report files in manifest */
    public static final char MANIFEST_REPORT_FILE = 'R';
    /** abbreviation label for log files in manifest */
    public static final char MANIFEST_LOG_FILE = 'L';
   
    // key log names
    private static final String LOGNAME_CRAWL = "crawl";
    private static final String LOGNAME_ALERTS = "alerts";
    private static final String LOGNAME_PROGRESS_STATISTICS =
        "progress-statistics";
    private static final String LOGNAME_URI_ERRORS = "uri-errors";
    private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors";
    private static final String LOGNAME_NONFATAL_ERRORS = "nonfatal-errors";


    protected ConfigPath crawlLogPath =
        new ConfigPath(Logs.CRAWL.getFilename(),Logs.CRAWL.getFilename());
    public ConfigPath getCrawlLogPath() {
        return crawlLogPath;
    }
    public void setCrawlLogPath(ConfigPath cp) {
        this.crawlLogPath.merge(cp);
    }
   
    protected ConfigPath alertsLogPath =
        new ConfigPath(Logs.ALERTS.getFilename(),Logs.ALERTS.getFilename());
    public ConfigPath getAlertsLogPath() {
        return alertsLogPath;
    }
    public void setAlertsLogPath(ConfigPath cp) {
        this.alertsLogPath.merge(cp);
    }
   
    protected ConfigPath progressLogPath =
        new ConfigPath(Logs.PROGRESS_STATISTICS.getFilename(),Logs.PROGRESS_STATISTICS.getFilename());
    public ConfigPath getProgressLogPath() {
        return progressLogPath;
    }
    public void setProgressLogPath(ConfigPath cp) {
        this.progressLogPath.merge(cp);
    }
   
    protected ConfigPath uriErrorsLogPath =
        new ConfigPath(Logs.URI_ERRORS.getFilename(),Logs.URI_ERRORS.getFilename());
    public ConfigPath getUriErrorsLogPath() {
        return uriErrorsLogPath;
    }
    public void setUriErrorsLogPath(ConfigPath cp) {
        this.uriErrorsLogPath.merge(cp);
    }
   
    protected ConfigPath runtimeErrorsLogPath =
        new ConfigPath(Logs.RUNTIME_ERRORS.getFilename(),Logs.RUNTIME_ERRORS.getFilename());
    public ConfigPath getRuntimeErrorsLogPath() {
        return runtimeErrorsLogPath;
    }
    public void setRuntimeErrorsLogPath(ConfigPath cp) {
        this.runtimeErrorsLogPath.merge(cp);
    }
   
    protected ConfigPath nonfatalErrorsLogPath =
        new ConfigPath(Logs.NONFATAL_ERRORS.getFilename(),Logs.NONFATAL_ERRORS.getFilename());
    public ConfigPath getNonfatalErrorsLogPath() {
        return nonfatalErrorsLogPath;
    }
    public void setNonfatalErrorsLogPath(ConfigPath cp) {
        this.nonfatalErrorsLogPath.merge(cp);
    }
   
    /** suffix to use on active logs */
//    public static final String CURRENT_LOG_SUFFIX = ".log";
   
    /**
     * Crawl progress logger.
     *
     * No exceptions.  Logs summary result of each url processing.
     */
    private transient Logger uriProcessing;

    /**
     * This logger contains unexpected runtime errors.
     *
     * Would contain errors trying to set up a job or failures inside
     * processors that they are not prepared to recover from.
     */
    private transient Logger runtimeErrors;

    /**
     * This logger is for job-scoped logging, specifically recoverable
     * errors which happen and are handled within a particular processor.
     *
     * Examples would be socket timeouts, exceptions thrown by
     * extractors, etc.
     */
    private transient Logger nonfatalErrors;

    /**
     * Special log for URI format problems, wherever they may occur.
     */
    private transient Logger uriErrors;

    /**
     * Statistics tracker writes here at regular intervals.
     */
    private transient Logger progressStats;

    /**
     * Record of fileHandlers established for loggers,
     * assisting file rotation.
     */
    transient private Map<Logger,FileHandler> fileHandlers;

    private StringBuffer manifest = new StringBuffer();
   
    private transient AlertThreadGroup atg;

    public CrawlerLoggerModule() {
       
    }
   
    public void start() {
        if(isRunning) {
            return;
        }
        this.atg = AlertThreadGroup.current();
        try {
            FileUtils.ensureWriteableDirectory(getPath().getFile());
            setupLogs();
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
        isRunning = true;
    }
   
    protected boolean isRunning = false;
    public boolean isRunning() {
        return this.isRunning;
    }
   
    public void stop() {
        isRunning = false;
    }
   
    public void destroy() {
        closeLogFiles();
    }
   
    protected void setupLogs() throws IOException {
        String logsPath = getPath().getFile().getAbsolutePath() + File.separatorChar;
        uriProcessing = Logger.getLogger(LOGNAME_CRAWL + "." + logsPath);
        runtimeErrors = Logger.getLogger(LOGNAME_RUNTIME_ERRORS + "." +
            logsPath);
        nonfatalErrors = Logger.getLogger(LOGNAME_NONFATAL_ERRORS + "." + logsPath);
        uriErrors = Logger.getLogger(LOGNAME_URI_ERRORS + "." + logsPath);
        progressStats = Logger.getLogger(LOGNAME_PROGRESS_STATISTICS + "." +
            logsPath);

        this.fileHandlers = new HashMap<Logger,FileHandler>();
        setupLogFile(uriProcessing,
            getCrawlLogPath().getFile().getAbsolutePath(),
            new UriProcessingFormatter(getLogExtraInfo()), true);

        setupLogFile(runtimeErrors,
            getRuntimeErrorsLogPath().getFile().getAbsolutePath(),
            new RuntimeErrorFormatter(getLogExtraInfo()), true);

        setupLogFile(nonfatalErrors,
            getNonfatalErrorsLogPath().getFile().getAbsolutePath(),
            new NonFatalErrorFormatter(getLogExtraInfo()), true);

        setupLogFile(uriErrors,
            getUriErrorsLogPath().getFile().getAbsolutePath(),
            new UriErrorFormatter(), true);

        setupLogFile(progressStats,
            getProgressLogPath().getFile().getAbsolutePath(),
            new StatisticsLogFormatter(), true);

        setupAlertLog(logsPath);
    }

    private void setupLogFile(Logger logger, String filename, Formatter f,
            boolean shouldManifest) throws IOException, SecurityException {
        logger.setLevel(Level.INFO); // set all standard loggers to INFO
        GenerationFileHandler fh = GenerationFileHandler.makeNew(filename, false,
            shouldManifest);
        fh.setFormatter(f);
        logger.addHandler(fh);
        addToManifest(filename, MANIFEST_LOG_FILE, shouldManifest);
        logger.setUseParentHandlers(false);
        this.fileHandlers.put(logger, fh);
    }
   
    public Logger setupSimpleLog(String logName) {
        Logger logger = Logger.getLogger(logName + ".log");
       
        Formatter f = new Formatter() {
            public String format(java.util.logging.LogRecord record) {
                return ArchiveUtils.getLog17Date(record.getMillis()) + " " + record.getMessage() + '\n';
            }
        };

        ConfigPath logPath = new ConfigPath(logName + ".log", logName + ".log");
        logPath.setBase(getPath());
        try {
            setupLogFile(logger, logPath.getFile().getAbsolutePath(), f, true);
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
       
        return logger;
    }

    private void setupAlertLog(String logsPath) throws IOException {
        Logger logger = Logger.getLogger(LOGNAME_ALERTS + "." + logsPath);
        String filename = getAlertsLogPath().getFile().getAbsolutePath();
        GenerationFileHandler fh =
            GenerationFileHandler.makeNew(filename, false, true);
        fh.setFormatter(new SimpleFormatter());
        AlertThreadGroup.current().addLogger(logger);
        AlertHandler.ensureStaticInitialization();
        logger.addHandler(fh);
        addToManifest(filename, MANIFEST_LOG_FILE, true);
        logger.setUseParentHandlers(false);
        this.fileHandlers.put(logger, fh)
    }

   
    public void rotateLogFiles() throws IOException {
        rotateLogFiles("." + ArchiveUtils.get14DigitDate());
    }
   
    protected void rotateLogFiles(String generationSuffix)
    throws IOException {
        rotateLogFiles(generationSuffix, false);
    }

    protected void rotateLogFiles(String generationSuffix, boolean mergeOld)
            throws IOException {
        for (Logger l : fileHandlers.keySet()) {
            GenerationFileHandler gfh = (GenerationFileHandler) fileHandlers.get(l);
            GenerationFileHandler newGfh = gfh.rotate(generationSuffix, "", mergeOld);
           
            if (gfh.shouldManifest()) {
                addToManifest((String) newGfh.getFilenameSeries().get(1),
                        MANIFEST_LOG_FILE, newGfh.shouldManifest());
            }
           
            l.removeHandler(gfh);
            l.addHandler(newGfh);
            fileHandlers.put(l, newGfh);
        }
    }
   
    /**
     * Close all log files and remove handlers from loggers.
     */
    public void closeLogFiles() {
        if (fileHandlers != null) {
            for (Logger l: fileHandlers.keySet()) {
                GenerationFileHandler gfh =
                        (GenerationFileHandler)fileHandlers.get(l);
                gfh.close();
                l.removeHandler(gfh);
            }
        }
    }

   
    /**
     * Add a file to the manifest of files used/generated by the current
     * crawl.
     *
     * TODO: Its possible for a file to be added twice if reports are
     * force generated midcrawl.  Fix.
     *
     * @param file The filename (with absolute path) of the file to add
     * @param type The type of the file
     * @param bundle Should the file be included in a typical bundling of
     *           crawler files.
     *
     * @see #MANIFEST_CONFIG_FILE
     * @see #MANIFEST_LOG_FILE
     * @see #MANIFEST_REPORT_FILE
     */
    public void addToManifest(String file, char type, boolean bundle) {
        manifest.append(type + (bundle? "+": "-") + " " + file + "\n");
    }
   
    public void startCheckpoint(Checkpoint checkpointInProgress) {}

    /**
     * Run checkpointing.
     *
     * <p>Default access only to be called by Checkpointer.
     * @throws Exception
     */
    public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
        // Rotate off crawler logs.
        rotateLogFiles("." + checkpointInProgress.getName(),
                checkpointInProgress.getForgetAllButLatest());
    }

    public void finishCheckpoint(Checkpoint checkpointInProgress) {}

    protected Checkpoint recoveryCheckpoint;
    @Autowired(required=false)
    public void setRecoveryCheckpoint(Checkpoint checkpoint) {
        this.recoveryCheckpoint = checkpoint;
    }
   
    public Logger getNonfatalErrors() {
        return nonfatalErrors;
    }


    public Logger getProgressStats() {
        return progressStats;
    }

    public Logger getRuntimeErrors() {
        return runtimeErrors;
    }


    public Logger getUriErrors() {
        return uriErrors;
    }

    public Logger getUriProcessing() {
        return uriProcessing;
    }
   
    public int getAlertCount() {
        if (atg != null) {
            return atg.getAlertCount();
        } else {
            return -1;
        }
    }
   
    public void resetAlertCount() {
        if (atg != null) {
            atg.resetAlertCount();
        }
    }


    /**
     * Log a URIException from deep inside other components to the crawl's
     * shared log.
     *
     * @param e URIException encountered
     * @param u CrawlURI where problem occurred
     * @param l String which could not be interpreted as URI without exception
     */
    public void logUriError(URIException e, UURI u, CharSequence l) {
        Object[] array = {u, l};
        uriErrors.log(Level.INFO, e.getMessage(), array);
    }
   
   
    private void readObject(ObjectInputStream in)
    throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        getPath().getFile().mkdirs();
        this.atg = AlertThreadGroup.current();
        this.setupLogs();
    }
   
   
    public void afterPropertiesSet() throws Exception {
        ConfigPath[] paths = {
                crawlLogPath, alertsLogPath, progressLogPath,
                uriErrorsLogPath, runtimeErrorsLogPath, nonfatalErrorsLogPath };
        for(ConfigPath cp : paths) {
            if(cp.getBase()==null) {
                cp.setBase(getPath());
            }
        }
    }
}
TOP

Related Classes of org.archive.crawler.reporting.CrawlerLoggerModule

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.