Examples of RobotRules


Examples of org.apache.nutch.protocol.RobotRules

          try {
            LOG.info("fetching " + fit.url);

            // fetch the page
            final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
            final RobotRules rules = protocol.getRobotRules(fit.url, fit.page);
            if (!rules.isAllowed(fit.u)) {
              // unblock
              fetchQueues.finishFetchItem(fit, true);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Denied by robots.txt: " + fit.url);
              }
              output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
                  CrawlStatus.STATUS_GONE);
              continue;
            }
            if (rules.getCrawlDelay() > 0) {
              if (rules.getCrawlDelay() > maxCrawlDelay) {
                // unblock
                fetchQueues.finishFetchItem(fit, true);
                LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
                output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED, CrawlStatus.STATUS_GONE);
                continue;
              } else {
                final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                fiq.crawlDelay = rules.getCrawlDelay();
              }
            }
            final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
            final ProtocolStatus status = output.getStatus();
            final Content content = output.getContent();
View Full Code Here

Examples of org.apache.nutch.protocol.RobotRules

   * @see org.apache.nutch.protocol.Protocol#getRobotRules(java.lang.String,
   * org.apache.nutch.storage.WebPage)
   */
  @Override
  public RobotRules getRobotRules(String url, WebPage page) {
    return new RobotRules() {

      @Override
      public boolean isAllowed(URL url) {
        // they're all allowed for now.
        return true;
View Full Code Here

Examples of org.apache.nutch.protocol.RobotRules

              if (LOG.isDebugEnabled()) {
                LOG.debug("redirectCount=" + redirectCount);
              }
              redirecting = false;
              final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
              final RobotRules rules = protocol.getRobotRules(fit.url, fit.page);
              if (!rules.isAllowed(fit.u)) {
                // unblock
                fetchQueues.finishFetchItem(fit, true);
                if (LOG.isDebugEnabled()) {
                  LOG.debug("Denied by robots.txt: " + fit.url);
                }
                output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
                    CrawlStatus.STATUS_GONE);
                continue;
              }
              if (rules.getCrawlDelay() > 0) {
                if (rules.getCrawlDelay() > maxCrawlDelay) {
                  // unblock
                  fetchQueues.finishFetchItem(fit, true);
                  LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
                  output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED, CrawlStatus.STATUS_GONE);
                  continue;
                } else {
                  final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                  fiq.crawlDelay = rules.getCrawlDelay();
                }
              }
              final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
              final ProtocolStatus status = output.getStatus();
              final Content content = output.getContent();
View Full Code Here

Examples of org.apache.nutch.protocol.RobotRules

   * @see org.apache.nutch.protocol.Protocol#getRobotRules(java.lang.String,
   * org.apache.nutch.storage.WebPage)
   */
  @Override
  public RobotRules getRobotRules(String url, WebPage page) {
    return new RobotRules() {

      @Override
      public boolean isAllowed(URL url) {
        // they're all allowed for now.
        return true;
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle Inc. Contact coftware#gmail.com.