// Package org.archive.modules

// Source Code of org.archive.modules.Processor

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.modules;


import java.io.IOException;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.modules.credential.Credential;
import org.archive.modules.credential.HttpAuthenticationCredential;
import org.archive.modules.deciderules.AcceptDecideRule;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.DecideRule;
import org.archive.net.UURI;
import org.archive.spring.HasKeyedProperties;
import org.archive.spring.KeyedProperties;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;


/**
* A processor of URIs.  The URI provides the context for the process;
* settings can be altered based on the URI.
*
* @author pjack
*/
public abstract class Processor
implements HasKeyedProperties,
           Lifecycle,
           BeanNameAware,
           Checkpointable {
    protected KeyedProperties kp = new KeyedProperties();
    public KeyedProperties getKeyedProperties() {
        return kp;
    }

    protected String beanName;
    public String getBeanName() {
        return this.beanName;
    }
    public void setBeanName(String name) {
        this.beanName = name;
    }
   
    /**
     * Whether or not this process will execute for a particular URI.
     * If this is false for a URI, then the URI isn't processed,
     * regardless of what the DecideRules say.
     */
    {
        setEnabled(true);
    }
    public boolean getEnabled() {
        return (Boolean) kp.get("enabled");
    }
    public void setEnabled(boolean enabled) {
        kp.put("enabled",enabled);
    }
   
   
    /**
     * Decide rule(s) (also particular to a URI) that determine whether
     * or not a particular URI is processed here. If the rule(s) answer
     * REJECT, processing is skipped. (ACCEPT or PASS allow processing
     * to continue).
     */
    {
        setShouldProcessRule(new AcceptDecideRule());
    }
    public DecideRule getShouldProcessRule() {
        return (DecideRule) kp.get("shouldProcessRule");
    }
    public void setShouldProcessRule(DecideRule rule) {
        kp.put("shouldProcessRule", rule);
    }

    /**
     * The number of URIs processed by this processor.
     */
    protected AtomicLong uriCount = new AtomicLong(0);

   
    /**
     * Processes the given URI.  First checks {@link #ENABLED} and
     * {@link #DECIDE_RULES}.  If ENABLED is false, then nothing happens.
     * If the DECIDE_RULES indicate REJECT, then the
     * {@link #innerRejectProcess(ProcessorURI)} method is invoked, and
     * the process method returns.
     *
     * <p>Next, the {@link #shouldProcess(ProcessorURI)} method is
     * consulted to see if this Processor knows how to handle the given
     * URI.  If it returns false, then nothing futher occurs.
     *
     * <p>FIXME: Should innerRejectProcess be called when ENABLED is false,
     * or when shouldProcess returns false?  The previous Processor
     * implementation didn't handle it that way.
     *
     * <p>Otherwise, the URI is considered valid.  This processor's count
     * of handled URIs is incremented, and the
     * {@link #innerProcess(ProcessorURI)} method is invoked to actually
     * perform the process.
     *
     * @param uri  The URI to process
     * @throws  InterruptedException   if the thread is interrupted
     */
    public ProcessResult process(CrawlURI uri)
    throws InterruptedException {
        if (!getEnabled()) {
            return ProcessResult.PROCEED;
        }
       
        if (getShouldProcessRule().decisionFor(uri) == DecideResult.REJECT) {
            innerRejectProcess(uri);
            return ProcessResult.PROCEED;
        }
       
        if (shouldProcess(uri)) {
            uriCount.incrementAndGet();
            return innerProcessResult(uri);
        } else {
            return ProcessResult.PROCEED;
        }
    }

    /**
     * Returns the number of URIs this processor has handled.  The returned
     * number does not include URIs that were rejected by the
     * {@link #ENABLED} flag, by the {@link #DECIDE_RULES}, or by the
     * {@link #shouldProcess(ProcessorURI)} method.
     *
     * @return  the number of URIs this processor has handled
     */
    public long getURICount() {
        return uriCount.get();
    }


    /**
     * Determines whether the given uri should be processed by this
     * processor.  For instance, a processor that only works on HTML
     * content might reject the URI if its content type is not
     * "text/html", if its content length is zero, and so on.
     *
     * @param uri   the URI to test
     * @return  true if this processor should process that uri; false if not
     */
    protected abstract boolean shouldProcess(CrawlURI uri);

   
    protected ProcessResult innerProcessResult(CrawlURI uri)
    throws InterruptedException {
        innerProcess(uri);
        return ProcessResult.PROCEED;
    }

    /**
     * Actually performs the process.  By the time this method is invoked,
     * it is known that the given URI passes the {@link #ENABLED}, the
     * {@link #DECIDE_RULES} and the {@link #shouldProcess(ProcessorURI)}
     * tests. 
     *
     * @param uri    the URI to process
     * @throws InterruptedException   if the thread is interrupted
     */
    protected abstract void innerProcess(CrawlURI uri)
    throws InterruptedException;


    /**
     * Invoked after a URI has been rejected.  The default implementation
     * does nothing; subclasses may override to log rejects or something.
     *
     * @param uri   the URI that was rejected
     * @throws InterruptedException   if the thread is interrupted
     */
    protected void innerRejectProcess(CrawlURI uri)
    throws InterruptedException {       
    }


    public static String flattenVia(CrawlURI puri) {
        UURI uuri = puri.getVia();
        return (uuri == null) ? "" : uuri.toString();
    }

   
    public static boolean isSuccess(CrawlURI puri) {
        boolean result = false;
        int statusCode = puri.getFetchStatus();
        if (statusCode == HttpStatus.SC_UNAUTHORIZED &&
            hasHttpAuthenticationCredential(puri)) {
            result = false;
        } else {
            result = (statusCode > 0);
        }
        return result;       
    }
   
   
    public static long getRecordedSize(CrawlURI puri) {
        if (puri.getRecorder() == null) {
            return puri.getContentSize();
        } else {
            return puri.getRecorder().getRecordedInput().getSize();
        }
    }
   

    /**
     * @return True if we have an HttpAuthentication (rfc2617) payload.
     */
    public static boolean hasHttpAuthenticationCredential(CrawlURI puri) {
        Set<Credential> credentials = puri.getCredentials();
        for (Credential ca: credentials) {
            if (ca instanceof HttpAuthenticationCredential) {
                return true;
            }
        }
        return false;
    }

    // FIXME: Raise to interface
    // FIXME: Internationalize somehow
    // FIXME: Pass in PrintWriter instead creating large in-memory strings
    public String report() {
        return "Processor: "+getClass().getName()+"\n";
    }
   
    protected boolean isRunning = false;
    public boolean isRunning() {
        return isRunning;
    }

    public void start() {
        if(isRunning) {
            return;
        }
        isRunning = true;
        if(recoveryCheckpoint!=null) {
            try {
                JSONObject json = recoveryCheckpoint.loadJson(getBeanName());
                fromCheckpointJson(json);
            } catch (JSONException e) {
                throw new RuntimeException(e);
            }
        }
    }


    public void stop() {
        isRunning = false;
    }
   
    public void startCheckpoint(Checkpoint checkpointInProgress) {}
   
    public void doCheckpoint(Checkpoint checkpointInProgress)
    throws IOException {
        try {
            JSONObject json = toCheckpointJson();
            checkpointInProgress.saveJson(beanName, json);
        } catch(JSONException j) {
            // impossible
        }
    }
 
    /**
     * Return a JSONObject of current stat that can be consulted
     * on recovery to restore necessary values.
     *
     * @return JSONObject
     * @throws JSONException
     */
    protected JSONObject toCheckpointJson() throws JSONException {
        JSONObject json = new JSONObject();
        json.put("uriCount", getURICount());
        return json;
    }
   
    /**
     * Restore internal state from JSONObject stored at earlier
     * checkpoint-time.
     *
     * @param json JSONObject
     * @throws JSONException
     */
    protected void fromCheckpointJson(JSONObject json) throws JSONException {
        uriCount.set(json.getLong("uriCount"));
    }
   
    public void finishCheckpoint(Checkpoint checkpointInProgress) {}
   
    protected Checkpoint recoveryCheckpoint;
    @Autowired(required=false)
    public void setRecoveryCheckpoint(Checkpoint checkpoint) {
        this.recoveryCheckpoint = checkpoint;
    }
}
// TOP
//
// Related Classes of org.archive.modules.Processor
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.