Package bixo.tools

Source Code of bixo.tools.ProcessRobotsTool

/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.tools;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;

import bixo.fetcher.BaseFetcher;
import bixo.robots.BaseRobotRules;
import bixo.robots.RobotUtils;
import bixo.robots.SimpleRobotRulesParser;
import bixo.utils.ConfigUtils;
import bixo.utils.UrlUtils;

public class ProcessRobotsTool {

    private static String readInputLine() throws IOException {
        InputStreamReader isr = new InputStreamReader(System.in);
        BufferedReader br = new BufferedReader(isr);
       
        try {
            return br.readLine();
        } finally {
            // TODO KKr - will this actually close System.in?
            // Should I reuse this buffered reader? Check out password masking code.
            // br.close();
        }
    }

    /**
     * @param args - URL to fetch
     */
    public static void main(String[] args) {
        System.setProperty("bixo.root.level", "TRACE");
        // Uncomment this to see the wire log for HttpClient
        // System.setProperty("bixo.http.level", "DEBUG");
       
        BaseFetcher fetcher = RobotUtils.createFetcher(ConfigUtils.BIXO_TOOL_AGENT, 1);
       
        boolean interactive = args.length == 0;
        int index = 0;
       
        while (interactive || (index < args.length)) {
          String url;
         
          try {
              if (interactive) {
                System.out.print("URL to fetch: ");
                url = readInputLine();
                if (url.length() == 0) {
                  System.exit(0);
                }
              } else {
                url = args[index++];
              }

              URL robotsUrl = new URL(url);
              if (!robotsUrl.getPath().toLowerCase().endsWith("/robots.txt")) {
                  robotsUrl = new URL(robotsUrl, "/robots.txt");
              }
             
              System.out.println("Processing " + robotsUrl.toExternalForm());
              BaseRobotRules rules = RobotUtils.getRobotRules(fetcher, new SimpleRobotRulesParser(), robotsUrl);
                System.out.println(String.format("Deferred visits = %s, allow all = %s, allow none = %s, top-level allowed = %s",
                                rules.isDeferVisits(),
                                rules.isAllowAll(),
                                rules.isAllowNone(),
                                rules.isAllowed(UrlUtils.makeProtocolAndDomain(url))));
                System.out.println();
          } catch (Exception e) {
            e.printStackTrace(System.out);
               
            if (interactive) {
                System.out.println();
                System.out.flush();
            } else {
              System.exit(-1);
            }
          }
        }
    }

}
TOP

Related Classes of bixo.tools.ProcessRobotsTool

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.