Examples of BaseRobotRules


Examples of bixo.robots.BaseRobotRules

            if (!robotsUrl.getPath().toLowerCase().endsWith("/robots.txt")) {
                robotsUrl = new URL(robotsUrl, "/robots.txt");
            }

            System.out.println("Processing " + robotsUrl.toExternalForm());
            BaseRobotRules rules = RobotUtils.getRobotRules(fetcher, new SimpleRobotRulesParser(), robotsUrl);
            System.out.println(String.format("Deferred visits = %s, allow all = %s, allow none = %s, top-level allowed = %s",
                    rules.isDeferVisits(),
                    rules.isAllowAll(),
                    rules.isAllowNone(),
                    rules.isAllowed(UrlUtils.makeProtocolAndDomain(url))));
            System.out.println();
        } catch (Exception e) {
            e.printStackTrace(System.out);

            if (interactive) {
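For reference, here is a minimal, self-contained sketch of the same parse-and-query flow, written against the crawlercommons.robots form of the API used by the later examples; the file path, URL, and agent name are placeholders:

import java.io.File;
import com.google.common.io.Files;
import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotRulesDemo {
    public static void main(String[] args) throws Exception {
        // Read a robots.txt from disk (the bytes could equally come from an HTTP fetch).
        byte[] robotsBytes = Files.toByteArray(new File("robots.txt"));

        // Parse it on behalf of a single crawler agent name.
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        BaseRobotRules rules = parser.parseContent("http://example.com/robots.txt",
                robotsBytes, "text/plain", "mycrawler");

        // Query the parsed rule set.
        System.out.println("allow all   = " + rules.isAllowAll());
        System.out.println("allow none  = " + rules.isAllowNone());
        System.out.println("crawl delay = " + rules.getCrawlDelay());
        System.out.println("allowed     = " + rules.isAllowed("http://example.com/some/page.html"));
    }
}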

Examples of bixo.robots.BaseRobotRules

               
                LOGGER.debug("Skipping URLs from not-good domain: " + domain);
               
                emptyQueue(_urls, GroupingKey.SKIPPED_GROUPING_KEY, _collector, _flowProcess);
            } else {
                BaseRobotRules robotRules = RobotUtils.getRobotRules(_fetcher, _parser, new URL(domainInfo.getProtocolAndDomain() + "/robots.txt"));

                String validKey = null;
                boolean isDeferred = robotRules.isDeferVisits();
                if (isDeferred) {
                    LOGGER.debug("Deferring visits to URLs from " + domainInfo.getDomain());
                    _flowProcess.increment(FetchCounters.DOMAINS_DEFERRED, 1);
                } else {
                    validKey = GroupingKey.makeGroupingKey(domainInfo.getHostAddress(), robotRules.getCrawlDelay());
                    _flowProcess.increment(FetchCounters.DOMAINS_FINISHED, 1);
                }

                // Use the same key for every URL from this domain
                GroupedUrlDatum datum;
                while ((datum = _urls.poll()) != null) {
                    ScoredUrlDatum scoreUrl;
                    FetchCounters counter;
                    String url = datum.getUrl();

                    if (isDeferred) {
                        counter = FetchCounters.URLS_DEFERRED;
                        scoreUrl = new ScoredUrlDatum(url, GroupingKey.DEFERRED_GROUPING_KEY, UrlStatus.SKIPPED_DEFERRED, 0.0);
                    } else if (!robotRules.isAllowed(url)) {
                        counter = FetchCounters.URLS_BLOCKED;
                        scoreUrl = new ScoredUrlDatum(url, GroupingKey.BLOCKED_GROUPING_KEY, UrlStatus.SKIPPED_BLOCKED, 0.0);
                    } else {
                        double score = _scorer.generateScore(domain, pld, datum);
                        if (score == BaseScoreGenerator.SKIP_SCORE) {
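Stripped of bixo's queues and counters, the branching above reduces to: defer the whole domain when robots.txt could not be fetched conclusively, otherwise keep only the URLs the rules allow. A minimal sketch, with selectFetchable as a hypothetical helper:

import java.util.ArrayList;
import java.util.List;
import bixo.robots.BaseRobotRules;

static List<String> selectFetchable(BaseRobotRules rules, List<String> urls) {
    List<String> fetchable = new ArrayList<String>();
    if (rules.isDeferVisits()) {
        return fetchable;  // robots.txt fetch was inconclusive; retry the whole domain later
    }
    for (String url : urls) {
        if (rules.isAllowed(url)) {
            fetchable.add(url);
        }
        // disallowed URLs are dropped rather than retried
    }
    return fetchable;
}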

Examples of crawlercommons.robots.BaseRobotRules

   * @return {@link BaseRobotRules} holding the rules from robots.txt
   */
  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {

    String cacheKey = getCacheKey(url);
    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);

    boolean cacheRule = true;
   
    if (robotRules == null) {                     // cache miss
      URL redir = null;
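The cache-miss branch (truncated above) goes on to fetch and parse the file, then stores the result under the same key. A minimal sketch of that per-host caching pattern, with a ConcurrentHashMap standing in for Nutch's CACHE and fetchAndParse as a hypothetical helper:

import java.net.URL;
import java.util.concurrent.ConcurrentHashMap;
import crawlercommons.robots.BaseRobotRules;

private final ConcurrentHashMap<String, BaseRobotRules> cache =
    new ConcurrentHashMap<String, BaseRobotRules>();

public BaseRobotRules getRules(URL url) {
    // One rule set per protocol + host, mirroring the cache key used above.
    String key = url.getProtocol().toLowerCase() + ":" + url.getHost().toLowerCase();
    BaseRobotRules rules = cache.get(key);
    if (rules == null) {                  // cache miss
        rules = fetchAndParse(url);       // hypothetical: fetch robots.txt and parse it
        cache.put(key, rules);
    }
    return rules;
}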

Examples of crawlercommons.robots.BaseRobotRules

      System.exit(-1);
    }

    try {
      byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]);

      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
      String testPath;
      while ((testPath = testsIn.readLine()) != null) {
        testPath = testPath.trim();  // trim each line, not just the first; also avoids an NPE on an empty file
        System.out.println((rules.isAllowed(testPath) ? "allowed" : "not allowed") + ":\t" + testPath);
      }
      testsIn.close();
    } catch (Exception e) {
      e.printStackTrace();
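Judging from the argument handling, this test tool takes a robots.txt file, a file of test paths (one per line), and an agent name, then prints one verdict per path in the format the loop builds. A hypothetical invocation (class name assumed):

    java RobotsTxtTester robots.txt test-paths.txt mybot
    allowed:	/index.html
    not allowed:	/private/report.html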

Examples of crawlercommons.robots.BaseRobotRules

  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {

    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
    String host = url.getHost().toLowerCase();          // normalize to lower case

    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host);

    boolean cacheRule = true;

    if (robotRules == null) {                     // cache miss
      if (LOG.isTraceEnabled())

Examples of crawlercommons.robots.BaseRobotRules

              if (LOG.isDebugEnabled()) {
                LOG.debug("redirectCount=" + redirectCount);
              }
              redirecting = false;
              Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
              BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
              if (!rules.isAllowed(fit.u.toString())) {
                // unblock
                fetchQueues.finishFetchItem(fit, true);
                if (LOG.isDebugEnabled()) {
                  LOG.debug("Denied by robots.txt: " + fit.url);
                }
                output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                reporter.incrCounter("FetcherStatus", "robots_denied", 1);
                continue;
              }
              if (rules.getCrawlDelay() > 0) {
                if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                  // unblock
                  fetchQueues.finishFetchItem(fit, true);
                  LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
                  output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                  reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1);
                  continue;
                } else {
                  FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                  fiq.crawlDelay = rules.getCrawlDelay();
                  if (LOG.isDebugEnabled()) {
                    LOG.info("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
                  }
                }
              }
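Once fiq.crawlDelay is set, the fetcher waits that long between requests to the same host. A minimal sketch of honoring the parsed delay outside of Nutch's queue machinery, with fetch as a hypothetical helper (crawler-commons reports the delay in milliseconds):

import java.util.List;
import crawlercommons.robots.BaseRobotRules;

static void politeFetch(BaseRobotRules rules, List<String> urls) throws InterruptedException {
    long crawlDelay = rules.getCrawlDelay();  // milliseconds; non-positive means no usable Crawl-Delay
    for (String url : urls) {
        if (!rules.isAllowed(url)) {
            continue;                         // skip URLs blocked by robots.txt
        }
        fetch(url);                           // hypothetical fetch helper
        if (crawlDelay > 0) {
            Thread.sleep(crawlDelay);         // be polite between requests to the same host
        }
    }
}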

Examples of crawlercommons.robots.BaseRobotRules

        agentNames.append(argv[counter]).append(",");

      agentNames.deleteCharAt(agentNames.length()-1);

      byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", agentNames.toString());

      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
      String testPath;
      while ((testPath = testsIn.readLine()) != null) {
        testPath = testPath.trim();  // trim each line, not just the first; also avoids an NPE on an empty file
        System.out.println((rules.isAllowed(testPath) ? "allowed" : "not allowed") +
            ":\t" + testPath);
      }
      testsIn.close();
    } catch (Exception e) {
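Note the comma-joined agent list built above: parseContent matches the robots.txt User-agent lines against any of the supplied comma-separated names, so a crawler that goes by several aliases can pass them all in a single call.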


Examples of crawlercommons.robots.BaseRobotRules

  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {

    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
    String host = url.getHost().toLowerCase();          // normalize to lower case

    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host);

    boolean cacheRule = true;
   
    if (robotRules == null) {                     // cache miss
      URL redir = null;