Package de.anomic.crawler

Examples of de.anomic.crawler.RobotsTxtEntry


            if (actions.indexOf("robots")>=0) {
                try {
                    final DigestURI theURL = new DigestURI(url);

                  // determine if crawling of the current URL is allowed
                    RobotsTxtEntry robotsEntry;
                    try {
                        robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
                    } catch (final IOException e) {
                        robotsEntry = null;
                    }
                  prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);

                    // get the sitemap URL of the domain
                    final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
                    prop.putXML("sitemap", sitemapURL == null ? "" : sitemapURL.toString());
                } catch (final MalformedURLException e) {}
            }

        }
View Full Code Here


            if (actions.indexOf("robots",0) >= 0) {
                try {
                    final DigestURI theURL = new DigestURI(url);

                  // determine if crawling of the current URL is allowed
                    RobotsTxtEntry robotsEntry;
                    try {
                        robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
                    } catch (final IOException e) {
                        robotsEntry = null;
                        Log.logException(e);
                    }
                  prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
                    prop.putHTML("robotsInfo", robotsEntry.getInfo());

                    // get the sitemap URL of the domain
                    final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
                    prop.putXML("sitemap", sitemapURL == null ? "" : sitemapURL.toString());
                } catch (final MalformedURLException e) {
                    Log.logException(e);
                }
            }
View Full Code Here

TOP

Related Classes of de.anomic.crawler.RobotsTxtEntry

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.