// Package org.archive.accesscontrol.robotstxt
//
// Source code of org.archive.accesscontrol.robotstxt.RobotClient

package org.archive.accesscontrol.robotstxt;

import java.io.IOException;
import java.util.Collection;

import org.apache.commons.httpclient.URIException;
import org.archive.accesscontrol.RobotsUnavailableException;
import org.archive.net.LaxURI;

/**
* A client for checking whether a robot is allowed by a robots.txt file.
*
* @author aosborne
*
*/
public abstract class RobotClient {
    /**
     * Returns true if a robot with the given user-agent is allowed to access
     * the given url.
     *
     * @param url
     * @param userAgent
     * @return
     * @throws IOException
     * @throws RobotsUnavailableException
     */
    public boolean isRobotPermitted(String url, String userAgent)
            throws IOException, RobotsUnavailableException {
        RobotRules rules = getRulesForUrl(url, userAgent);
        return !rules.blocksPathForUA(new LaxURI(url, false).getPath(),
                userAgent);
    }

    /**
     * Fetch the applicable ruleset for the given url and robot.
     *
     * @param url
     * @param userAgent
     * @return
     * @throws IOException a local problem occurred when attempting to fetch the robots.txt
     * @throws RobotsUnavailableException a remote problem, we found no robots.txt or the server is down.
     */
    public abstract RobotRules getRulesForUrl(String url, String userAgent)
            throws IOException, RobotsUnavailableException;

    public static String robotsUrlForUrl(String url) throws URIException {
        LaxURI uri = new LaxURI(url, false);
        uri.setPath("/robots.txt");
        uri.setQuery(null);
        uri.setFragment(null);
        return uri.toString();
    }
   
    /**
     * Prepare the cache to lookup info for a given set of urls. The fetches
     * happen in parallel so this also makes a good option for speeding up bulk lookups.
     *
     * This may be a no-op.
     */
    public abstract void prepare(Collection<String> urls, String userAgent);
   
    /**
     * Use a proxy server when fetching robots.txt data.
     * @param host
     * @param port
     */
    public abstract void setRobotProxy(String host, int port);
}
// TOP
//
// Related Classes of org.archive.accesscontrol.robotstxt.RobotClient
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.