Examples of CrawlConfig


Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

    /**
     * @param storageFolder location used to store the temporary data structures used by the crawler.
     */
    public SiteCrawler(File storageFolder) {
        try {
            crawlConfig = new CrawlConfig();
            crawlConfig.setCrawlStorageFolder(storageFolder.getAbsolutePath());
            crawlConfig.setUserAgentString("Apache Any23 Web Crawler");
           
            final PageFetcher pageFetcher = new PageFetcher(crawlConfig);

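The excerpt stops right after the PageFetcher is constructed. For context, the usual next step in crawler4j (a sketch of the standard wiring, not taken from the Any23 source) is to build the robots.txt handling and the CrawlController from the same objects:

            // Sketch of the standard follow-up wiring (assumed, not from the Any23 source).
            // RobotstxtConfig and RobotstxtServer live in edu.uci.ics.crawler4j.robotstxt.
            final RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            final CrawlController controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);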

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();

    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Be polite: Make sure that we don't send more than 1 request per
     * second (1000 milliseconds between requests).
     */
    config.setPolitenessDelay(1000);

    /*
     * You can set the maximum crawl depth here. The default value is -1
     * (unlimited depth).
     */
    config.setMaxDepthOfCrawling(2);

    /*
     * You can set the maximum number of pages to crawl. The default value
     * is -1 (unlimited number of pages).
     */
    config.setMaxPagesToFetch(1000);

    /*
     * Do you need to set a proxy? If so, you can use:
     * config.setProxyHost("proxyserver.example.com");
     * config.setProxyPort(8080);
     *
     * If your proxy also needs authentication:
     * config.setProxyUsername(username); config.setProxyPassword(password);
     */

    /*
     * This config parameter can be used to make your crawl resumable
     * (meaning that you can resume the crawl from a previously
     * interrupted/crashed crawl). Note: if you enable resumable crawling and
     * want to start a fresh crawl, you need to delete the contents of
     * rootFolder manually.
     */
     */
    config.setResumableCrawling(false);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
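The excerpt ends at the PageFetcher; the rest of this example typically follows the standard crawler4j pattern of wiring up a robots.txt server and controller (as in the sketch above) and then seeding and starting the crawl. A sketch, where BasicCrawler is a placeholder for the user-defined WebCrawler subclass:

    // Sketch (assumed): seed and launch the crawl.
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    // Seed URLs are the starting points of the crawl.
    controller.addSeed("http://www.ics.uci.edu/");

    // Blocks until the crawl finishes; BasicCrawler must extend WebCrawler.
    controller.start(BasicCrawler.class, numberOfCrawlers);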

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];

    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();

    /*
     * The two crawlers should have different storage folders for their
     * intermediate data
     */
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");

    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);

    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);

    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
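To actually run the two crawlers concurrently, crawler4j's controller can be started in non-blocking mode. A sketch of how such an example typically finishes (the crawler class names and the robots.txt setup for each fetcher are placeholders):

    // Sketch (assumed): one controller per configuration, started without blocking.
    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer1);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer2);

    controller1.startNonBlocking(FirstCrawler.class, 5);
    controller2.startNonBlocking(SecondCrawler.class, 5);

    // Wait for both crawls to complete.
    controller1.waitUntilFinish();
    controller2.waitUntilFinish();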

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();

    config.setCrawlStorageFolder(crawlStorageFolder);

    config.setPolitenessDelay(1000);

    // Unlimited number of pages can be crawled.
    config.setMaxPagesToFetch(-1);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
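The class handed to the controller's start method is a WebCrawler subclass that decides which links to follow and what to do with each fetched page. A minimal sketch (names are placeholders; the shouldVisit signature shown is the crawler4j 4.x form and differs in older versions):

    public class MyCrawler extends WebCrawler {

        @Override
        public boolean shouldVisit(Page referringPage, WebURL url) {
            // Follow only links inside the target domain.
            return url.getURL().toLowerCase().startsWith("http://www.ics.uci.edu/");
        }

        @Override
        public void visit(Page page) {
            // Called once for every successfully fetched page.
            System.out.println("Visited: " + page.getWebURL().getURL());
        }
    }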

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    String storageFolder = args[2];

    CrawlConfig config = new CrawlConfig();

    config.setCrawlStorageFolder(rootFolder);

    /*
     * Since images are binary content, we need to set this parameter to
     * true to make sure they are included in the crawl.
     */
    config.setIncludeBinaryContentInCrawling(true);

    String[] crawlDomains = new String[] { "http://uci.edu/" };

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
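Because binary content is included, the crawler's visit method can recognize image responses and write their raw bytes under storageFolder. A rough sketch of such a method, not the actual ImageCrawler source (the naming scheme is hypothetical; Files and Paths come from java.nio.file):

    // Sketch (assumed): inside the WebCrawler subclass used with this controller.
    @Override
    public void visit(Page page) {
        String contentType = page.getContentType();
        // Keep only image responses, e.g. "image/png".
        if (contentType == null || !contentType.startsWith("image")) {
            return;
        }
        try {
            // Hypothetical naming scheme: URL hash, stored under storageFolder (args[2]).
            String filename = storageFolder + "/" + page.getWebURL().getURL().hashCode() + ".img";
            Files.write(Paths.get(filename), page.getContentData());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }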

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

  private Parser parser;
  private PageFetcher pageFetcher;

  public Downloader() {
    CrawlConfig config = new CrawlConfig();
    parser = new Parser(config);
    pageFetcher = new PageFetcher(config);
  }
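A Downloader like this is typically used to fetch and parse a single URL outside of a full crawl. The sketch below shows the general shape of such a method; the exact PageFetcher and PageFetchResult calls (fetchPage vs. fetchHeader, fetchContent's signature) changed between crawler4j versions, so treat each call as an assumption to verify against the version in use:

  // Rough sketch; all fetch/parse calls are assumptions to check against your crawler4j version.
  public Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchPage(curURL);  // older versions: fetchHeader(curURL)
      Page page = new Page(curURL);
      fetchResult.fetchContent(page);               // copy the response body into the page
      parser.parse(page, curURL.getURL());          // populate page.getParseData()
      return page;
    } catch (Exception e) {
      return null;
    } finally {
      if (fetchResult != null) {
        fetchResult.discardContentIfNotConsumed();
      }
    }
  }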

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

      return;
    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    config.setMaxPagesToFetch(10);
    config.setPolitenessDelay(1000);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);


Examples of org.sf.mustru.crawl.CrawlConfig

  public CrawlConfigWizard(boolean setup)
  {
    //*-- read from the file or initialize the configuration
    setWindowTitle("Crawl Configuration wizard");
    crawlConfig = new CrawlConfig(setup);
  }
View Full Code Here

Examples of org.sf.mustru.crawl.CrawlConfig

public OnlineIndex(Shell parentShell)
{
  super(parentShell);
  addMenuBar();
  addToolBar(SWT.FLAT);
  args = new String[2];

  //*-- set the index and database directories
  CrawlConfig crawlConfig = new CrawlConfig(false);   //*-- initialize from the properties file
  Constants.setDBDIR(crawlConfig.getDbDir());
  Constants.setINDEXDIR(crawlConfig.getIndexDir());
  Constants.setWEBDIR(crawlConfig.getWebDir());
  createDbEnv();
  //addStatusLine();
}