Package org.archive.modules.net

Examples of org.archive.modules.net.RobotsPolicy


        String httpEquiv = element.getAttributeValue("http-equiv");
        String content = element.getAttributeValue("content");

        if ("robots".equals(name) && content != null) {
            curi.getData().put(A_META_ROBOTS, content);
            RobotsPolicy policy = metadata.getRobotsPolicy();
            String contentLower = content.toLowerCase();
            if (policy.obeyMetaRobotsNofollow()
                 && (contentLower.indexOf("nofollow") >= 0
                 || contentLower.indexOf("none") >= 0)) {
                // if 'nofollow' or 'none' is specified and the
                // honoring policy is not IGNORE or CUSTOM, end html extraction
                logger.fine("HTML extraction skipped due to robots meta-tag " +
View Full Code Here


            return true;
        }
        // test against robots.txt if available
        if (cs.isValidRobots()) {
            String ua = metadata.getUserAgent();
            RobotsPolicy robots = metadata.getRobotsPolicy();
            if(!robots.allows(ua, curi, cs.getRobotstxt())) {
                if(getCalculateRobotsOnly()) {
                    // annotate URI as excluded, but continue to process normally
                    curi.getAnnotations().add("robotExcluded");
                    return false;
                }
View Full Code Here

        TextUtils.recycleMatcher(attr);

        // Look for the 'robots' meta-tag
        if("robots".equalsIgnoreCase(name) && content != null ) {
            curi.getData().put(A_META_ROBOTS, content);
            RobotsPolicy policy = metadata.getRobotsPolicy();
            String contentLower = content.toLowerCase();
            if (policy.obeyMetaRobotsNofollow()
                && (contentLower.indexOf("nofollow") >= 0
                    || contentLower.indexOf("none") >= 0)) {
                // if 'nofollow' or 'none' is specified and the
                // honoring policy is not IGNORE or CUSTOM, end html extraction
                logger.fine("HTML extraction skipped due to robots meta-tag for: "
View Full Code Here

TOP

Related Classes of org.archive.modules.net.RobotsPolicy

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.