Package org.archive.crawler.postprocessor

Source Code of org.archive.crawler.postprocessor.LowDiskPauseProcessor

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.crawler.postprocessor;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.archive.crawler.framework.CrawlController;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.CrawlURI;
import org.springframework.beans.factory.annotation.Autowired;

/**
* Processor module which uses 'df -k', where available and with
* the expected output format (on Linux), to monitor available
* disk space and pause the crawl if free space on  monitored
* filesystems falls below certain thresholds.
*
* @deprecated Is highly system dependant.
*             Use {@link org.archive.crawler.monitor.DiskSpaceMonitor} instead.
*/
@Deprecated
public class LowDiskPauseProcessor extends Processor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    /**
     * Logger.
     */
    private static final Logger logger =
        Logger.getLogger(LowDiskPauseProcessor.class.getName());


    protected CrawlController controller;
    public CrawlController getCrawlController() {
        return this.controller;
    }
    @Autowired
    public void setCrawlController(CrawlController controller) {
        this.controller = controller;
    }
   
    /**
     * List of filessystem mounts whose 'available' space should be monitored
     * via 'df' (if available).
     */
    protected List<String> monitorMounts = new ArrayList<String>();
    public List<String> getMonitorMounts() {
        return this.monitorMounts;
    }
    public void setMonitorMounts(List<String> monitorMounts) {
        this.monitorMounts = monitorMounts;
    }

    /**
     * When available space on any monitored mounts falls below this threshold,
     * the crawl will be paused.
     */
    protected int pauseThresholdKb = 500*1024; // 500MB
    public int getPauseThresholdKb() {
        return this.pauseThresholdKb;
    }
    public void setPauseThresholdKb(int pauseThresholdKb) {
        this.pauseThresholdKb = pauseThresholdKb;
    }
   
    /**
     * Available space via 'df' is rechecked after every increment of this much
     * content (uncompressed) is observed.
     */
    protected int recheckThresholdKb = 200*1024; // 200MB
    public int getRecheckThresholdKb() {
        return this.recheckThresholdKb;
    }
    public void setRecheckThresholdKb(int recheckThresholdKb) {
        this.recheckThresholdKb = recheckThresholdKb;
    }
   
    protected int contentSinceCheck = 0;
   
    public static final Pattern VALID_DF_OUTPUT =
        Pattern.compile("(?s)^Filesystem\\s+1K-blocks\\s+Used\\s+Available\\s+Use%\\s+Mounted on\\n.*");
    public static final Pattern AVAILABLE_EXTRACTOR =
        Pattern.compile("(?m)\\s(\\d+)\\s+\\d+%\\s+(\\S+)$");
   
    /**
     * @param name Name of this writer.
     */
    public LowDiskPauseProcessor() {
    }
   
   
    @Override
    protected boolean shouldProcess(CrawlURI curi) {
        return true;
    }

    @Override
    protected void innerProcess(CrawlURI uri) {
        throw new AssertionError();
    }
   
    /**
     * Notes a CrawlURI's content size in its running tally. If the
     * recheck increment of content has passed through since the last
     * available-space check, checks available space and pauses the
     * crawl if any monitored mounts are below the configured threshold.
     *
     * @param curi CrawlURI to process.
     */
    @Override
    protected ProcessResult innerProcessResult(CrawlURI curi) {
        synchronized (this) {
            contentSinceCheck += curi.getContentSize();
            if (contentSinceCheck/1024 > getRecheckThresholdKb()) {
                ProcessResult r = checkAvailableSpace(curi);
                contentSinceCheck = 0;
                return r;
            } else {
                return ProcessResult.PROCEED;
            }
        }
    }


    /**
     * Probe via 'df' to see if monitored mounts have fallen
     * below the pause available threshold. If so, request a
     * crawl pause.
     * @param curi Current context.
     */
    private ProcessResult checkAvailableSpace(CrawlURI curi) {
        try {
            String df = IOUtils.toString(Runtime.getRuntime().exec(
                    "df -k").getInputStream());
            Matcher matcher = VALID_DF_OUTPUT.matcher(df);
            if(!matcher.matches()) {
                logger.severe("'df -k' output unacceptable for low-disk checking");
                return ProcessResult.PROCEED;
            }
            List<String> monitoredMounts = getMonitorMounts();
            matcher = AVAILABLE_EXTRACTOR.matcher(df);
            while (matcher.find()) {
                String mount = matcher.group(2);
                if (monitoredMounts.contains(mount)) {
                    long availKilobytes = Long.parseLong(matcher.group(1));
                    int thresholdKilobytes = getPauseThresholdKb();
                    if (availKilobytes < thresholdKilobytes ) {
                        logger.log(Level.SEVERE, "Low Disk Pause",
                                availKilobytes + "K available on " + mount
                                        + " (below threshold "
                                        + thresholdKilobytes + "K)");
                        controller.requestCrawlPause();
                        return ProcessResult.PROCEED;
                    }
                }
            }
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
        }
        return ProcessResult.PROCEED;
    }
}
TOP

Related Classes of org.archive.crawler.postprocessor.LowDiskPauseProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.