Examples of BaseRobotRules


Examples of crawlercommons.robots.BaseRobotRules

            LOG.info("fetching " + fit.url + " (queue crawl delay=" +
                      fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay + "ms)");

            // fetch the page
            final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
            final BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.page);
            if (!rules.isAllowed(fit.u.toString())) {
              // unblock
              fetchQueues.finishFetchItem(fit, true);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Denied by robots.txt: " + fit.url);
              }
              output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
                  CrawlStatus.STATUS_GONE);
              continue;
            }
            if (rules.getCrawlDelay() > 0) {
              if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                // unblock
                fetchQueues.finishFetchItem(fit, true);
                LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
                output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED, CrawlStatus.STATUS_GONE);
                continue;
              } else {
                final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                fiq.crawlDelay = rules.getCrawlDelay();
                if (LOG.isDebugEnabled()) {
                  LOG.info("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
                }
              }
            }
View Full Code Here

Examples of crawlercommons.robots.BaseRobotRules

        agentNames.append(argv[counter]).append(",");

      agentNames.deleteCharAt(agentNames.length()-1);

      byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", agentNames.toString());

      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
      String testPath = testsIn.readLine().trim();
      while (testPath != null) {
        System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not allowed") +
            ":\t" + testPath);
        testPath = testsIn.readLine();
      }
      testsIn.close();
    } catch (Exception e) {
View Full Code Here

Examples of crawlercommons.robots.BaseRobotRules

  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {

    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
    String host = url.getHost().toLowerCase();          // normalize to lower case

    BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);

    boolean cacheRule = true;
   
    if (robotRules == null) {                     // cache miss
      URL redir = null;
View Full Code Here

Examples of crawlercommons.robots.BaseRobotRules

  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {

    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
    String host = url.getHost().toLowerCase();          // normalize to lower case

    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host);

    boolean cacheRule = true;

    if (robotRules == null) {                     // cache miss
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.