Package net.matuschek.spider

Examples of net.matuschek.spider.WebRobot


  public static void main(String[] args)
    throws Exception
  {
    System.out.println("URLs will be logged to urls.txt\n\n");

    WebRobot robby = new WebRobot();
    robby.setStartURL(new URL("http://www.matuschek.net"));
    robby.setMaxDepth(1);
    robby.setSleepTime(0);

    FileWriter logfile = new FileWriter("urls.txt");
    URLLogger log = new URLLogger(logfile);
    robby.setDocManager(log);

    robby.run();
    logfile.close();
  }
View Full Code Here


  }



  public void run() throws Exception {
    WebRobot robby = new WebRobot();
    robby.setStartURL(new URL("http://www.matuschek.net"));
    robby.setMaxDepth(1);
    robby.setSleepTime(0);

    // download only the first 5 documents
    DownloadStopper stopit = new DownloadStopper(5,robby);
    robby.setWebRobotCallback(stopit);

    robby.run();
  }
View Full Code Here

    // get command line options
    GetOpt opt = new GetOpt(argv);
    String option = null;

    JoBoBase jobobase = JoBoBase.createFromXML();
    WebRobot robby = jobobase.getRobot();

    // referer
    option=opt.getOptionString("r");
    if (option != null) {
      robby.setStartReferer(option);
    }
   
    // maximal depth
    option=opt.getOptionString("d");
    if (option != null) {
  try {
    int maxDepth=Integer.parseInt(option);
    robby.setMaxDepth(maxDepth);
  } catch (NumberFormatException e) {
    System.out.println("Wrong number for maxDepth: "+option);
  }
    }
   
    // walk to other hosts ?
    if (opt.getOptionBoolean("o")) {
      robby.setWalkToOtherHosts(true);
    }
   
    // store directory
    option=opt.getOptionString("s");
    if (option != null) {
      basedir=option;
    }
   
    // minimal file size
    option=opt.getOptionString("m");
    if (option != null) {
      try {
  minSize=Integer.parseInt(option);
      } catch (NumberFormatException e) {}
    }
   
    // agent name
    option=opt.getOptionString("a");
    if (option != null) {
      robby.setAgentName(option);
    }
   
    // ignore robots.txt
    if (opt.getOptionBoolean("i")) {
      robby.setIgnoreRobotsTxt(true);
    }
   
    // wait time
    option=opt.getOptionString("w");
    if (option != null) {
      try {
  int waitTime=Integer.parseInt(option);
  robby.setSleepTime(waitTime*1000);
      } catch (NumberFormatException e) {}
    }
   
    // print usage
    if (opt.getOptionBoolean("?")) {
      printUsage();
      return;
    }
   
    URL u = new URL(argv[argv.length-1]);

    HttpDocToFile docStore=new HttpDocToFile(basedir);
    docStore.setMinFileSize(minSize);

    SystemOutHttpToolCallback statusInfo = new SystemOutHttpToolCallback();

    robby.setStartURL(u);
    robby.setDocManager(docStore);
    robby.setHttpToolCallback(statusInfo);
   
    robby.run();
   
  }
View Full Code Here

    throws ClassNotFoundException
  {
    log = Category.getInstance(this.getClass());
    docstore = new HttpDocToFile(storageDirectory);
    initializeFilters();
    robot = new WebRobot();
    robot.setFilters(filters);
  }
View Full Code Here

TOP

Related Classes of net.matuschek.spider.WebRobot

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.