Package crawlercommons.robots

Examples of crawlercommons.robots.BaseRobotRules
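
A minimal, self-contained sketch of the typical entry point before diving into the fragments below: crawler-commons' SimpleRobotRulesParser parses a robots.txt payload into a BaseRobotRules object, which is then queried with isAllowed() and getCrawlDelay(). The robots.txt body, URL, and agent name here are made-up placeholders.

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

import java.nio.charset.StandardCharsets;

public class RobotRulesExample {
  public static void main(String[] args) {
    // Placeholder robots.txt body; in a crawler this would be the fetched response.
    byte[] robotsTxt = ("User-agent: *\n"
                      + "Disallow: /private/\n"
                      + "Crawl-delay: 5\n").getBytes(StandardCharsets.UTF_8);

    SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
    // parseContent(url, content, contentType, robotName) returns a BaseRobotRules instance.
    BaseRobotRules rules = parser.parseContent("http://example.com/robots.txt",
        robotsTxt, "text/plain", "mycrawler");

    System.out.println(rules.isAllowed("http://example.com/index.html")); // expected: true
    System.out.println(rules.isAllowed("http://example.com/private/x"));  // expected: false
    System.out.println(rules.getCrawlDelay());                            // crawl delay in milliseconds
  }
}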


   * @return {@link BaseRobotRules} holding the rules from robots.txt
   */
  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {

    String cacheKey = getCacheKey(url);
    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);

    boolean cacheRule = true;
   
    if (robotRules == null) {                     // cache miss
      URL redir = null;
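
The fragment above, like the similar getRobotRulesSet() fragments further down, consults a per-host cache of parsed rules before fetching robots.txt again. Below is a stand-alone sketch of that caching pattern; the RobotRulesCache class, the Function-based fetcher hook, and the "protocol:host" key format are illustrative assumptions, not APIs of crawler-commons or Nutch.

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

import java.net.URL;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;

public class RobotRulesCache {
  private final Map<String, BaseRobotRules> cache = new ConcurrentHashMap<>();
  private final SimpleRobotRulesParser parser = new SimpleRobotRulesParser();

  // fetcher supplies the raw robots.txt bytes for the URL's host (hypothetical hook).
  public BaseRobotRules getRobotRulesSet(URL url, String agentName, Function<URL, byte[]> fetcher) {
    // Normalize protocol and host to lower case so equivalent URLs share one cache entry.
    String cacheKey = url.getProtocol().toLowerCase() + ":" + url.getHost().toLowerCase();
    return cache.computeIfAbsent(cacheKey, key ->
        parser.parseContent(url.toString(), fetcher.apply(url), "text/plain", agentName));
  }
}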


      System.exit(-1);
    }

    try {
      byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]);

      // Check every URL path listed in the test file against the parsed rules.
      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
      String testPath = testsIn.readLine();
      while (testPath != null) {
        testPath = testPath.trim();
        System.out.println((rules.isAllowed(testPath) ? "allowed" : "not allowed") + ":\t" + testPath);
        testPath = testsIn.readLine();
      }
      testsIn.close();
    } catch (Exception e) {
      e.printStackTrace();

  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {

    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
    String host = url.getHost().toLowerCase();          // normalize to lower case

    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host);

    boolean cacheRule = true;

    if (robotRules == null) {                     // cache miss
      if (LOG.isTraceEnabled())

              if (LOG.isDebugEnabled()) {
                LOG.debug("redirectCount=" + redirectCount);
              }
              redirecting = false;
              Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
              BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
              if (!rules.isAllowed(fit.u.toString())) {
                // unblock
                fetchQueues.finishFetchItem(fit, true);
                if (LOG.isDebugEnabled()) {
                  LOG.debug("Denied by robots.txt: " + fit.url);
                }
                output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                reporter.incrCounter("FetcherStatus", "robots_denied", 1);
                continue;
              }
              if (rules.getCrawlDelay() > 0) {
                if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                  // unblock
                  fetchQueues.finishFetchItem(fit, true);
                  LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
                  output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                  reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1);
                  continue;
                } else {
                  FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                  fiq.crawlDelay = rules.getCrawlDelay();
                  if (LOG.isDebugEnabled()) {
                    LOG.info("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
                  }
                }
              }
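
Distilled from the fetcher loop above: once the protocol layer has produced a BaseRobotRules instance, the decision to fetch comes down to isAllowed() plus a ceiling on the advertised Crawl-delay. The helper below is an illustrative sketch of that gate (the RobotsGate class and the maxCrawlDelay parameter are assumptions, not Nutch code); crawler-commons reports getCrawlDelay() in milliseconds.

import crawlercommons.robots.BaseRobotRules;

public final class RobotsGate {
  // Returns true if the URL may be fetched: allowed by robots.txt and, if a Crawl-delay is
  // set, not longer than the maximum delay (in milliseconds) the crawler is willing to honor.
  public static boolean mayFetch(BaseRobotRules rules, String url, long maxCrawlDelay) {
    if (!rules.isAllowed(url)) {
      return false;                         // denied by robots.txt
    }
    long delay = rules.getCrawlDelay();
    if (delay > 0 && maxCrawlDelay >= 0 && delay > maxCrawlDelay) {
      return false;                         // Crawl-delay too long; skip as the fetcher above does
    }
    return true;
  }
}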

        agentNames.append(argv[counter]).append(",");   // comma-separate the agent names passed on the command line

      agentNames.deleteCharAt(agentNames.length() - 1); // drop the trailing comma

      byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", agentNames.toString());

      // Check every URL path listed in the test file against the parsed rules.
      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
      String testPath = testsIn.readLine();
      while (testPath != null) {
        testPath = testPath.trim();
        System.out.println((rules.isAllowed(testPath) ? "allowed" : "not allowed") +
            ":\t" + testPath);
        testPath = testsIn.readLine();
      }
      testsIn.close();
    } catch (Exception e) {


  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {

    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
    String host = url.getHost().toLowerCase();          // normalize to lower case

    BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);

    boolean cacheRule = true;
   
    if (robotRules == null) {                     // cache miss
      URL redir = null;

            LOG.info("fetching " + fit.url + " (queue crawl delay=" +
                      fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay + "ms)");

            // fetch the page
            final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
            final BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.page);
            if (!rules.isAllowed(fit.u.toString())) {
              // unblock
              fetchQueues.finishFetchItem(fit, true);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Denied by robots.txt: " + fit.url);
              }
              output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
                  CrawlStatus.STATUS_GONE);
              continue;
            }
            if (rules.getCrawlDelay() > 0) {
              if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                // unblock
                fetchQueues.finishFetchItem(fit, true);
                LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
                output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED, CrawlStatus.STATUS_GONE);
                continue;
              } else {
                final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                fiq.crawlDelay = rules.getCrawlDelay();
                if (LOG.isDebugEnabled()) {
                  LOG.info("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
                }
              }
            }

