Package bixo.robots

Examples of bixo.robots.BaseRobotRules


              if (!robotsUrl.getPath().toLowerCase().endsWith("/robots.txt")) {
                  robotsUrl = new URL(robotsUrl, "/robots.txt");
              }
             
              System.out.println("Processing " + robotsUrl.toExternalForm());
              BaseRobotRules rules = RobotUtils.getRobotRules(fetcher, new SimpleRobotRulesParser(), robotsUrl);
                System.out.println(String.format("Deferred visits = %s, allow all = %s, allow none = %s, top-level allowed = %s",
                                rules.isDeferVisits(),
                                rules.isAllowAll(),
                                rules.isAllowNone(),
                                rules.isAllowed(UrlUtils.makeProtocolAndDomain(url))));
                System.out.println();
          } catch (Exception e) {
            e.printStackTrace(System.out);
               
            if (interactive) {
View Full Code Here


               
                LOGGER.debug("Skipping URLs from not-good domain: " + domain);
               
                emptyQueue(_urls, GroupingKey.SKIPPED_GROUPING_KEY, _collector, _flowProcess);
            } else {
                BaseRobotRules robotRules = RobotUtils.getRobotRules(_fetcher, _parser, new URL(domainInfo.getProtocolAndDomain() + "/robots.txt"));

                String validKey = null;
                boolean isDeferred = robotRules.isDeferVisits();
                if (isDeferred) {
                    LOGGER.debug("Deferring visits to URLs from " + domainInfo.getDomain());
                    _flowProcess.increment(FetchCounters.DOMAINS_DEFERRED, 1);
                } else {
                    validKey = GroupingKey.makeGroupingKey(domainInfo.getHostAddress(), robotRules.getCrawlDelay());
                    _flowProcess.increment(FetchCounters.DOMAINS_FINISHED, 1);
                }

                // Use the same key for every URL from this domain
                GroupedUrlDatum datum;
                while ((datum = _urls.poll()) != null) {
                    ScoredUrlDatum scoreUrl;
                    FetchCounters counter;
                    String url = datum.getUrl();

                    if (isDeferred) {
                        counter = FetchCounters.URLS_DEFERRED;
                        scoreUrl = new ScoredUrlDatum(url, GroupingKey.DEFERRED_GROUPING_KEY, UrlStatus.SKIPPED_DEFERRED, 0.0);
                    } else if (!robotRules.isAllowed(url)) {
                        counter = FetchCounters.URLS_BLOCKED;
                        scoreUrl = new ScoredUrlDatum(url, GroupingKey.BLOCKED_GROUPING_KEY, UrlStatus.SKIPPED_BLOCKED, 0.0);
                    } else {
                        double score = _scorer.generateScore(domain, pld, datum);
                        if (score == BaseScoreGenerator.SKIP_SCORE) {
View Full Code Here

TOP

Related Classes of bixo.robots.BaseRobotRules

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.