Examples of CrawlConfig


Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

    /**
     * @param storageFolder location used to store the temporary data structures used by the crawler.
     */
    public SiteCrawler(File storageFolder) {
        try {
            crawlConfig = new CrawlConfig();
            crawlConfig.setCrawlStorageFolder(storageFolder.getAbsolutePath());
            crawlConfig.setUserAgentString("Apache Any23 Web Crawler");
           
            final PageFetcher pageFetcher = new PageFetcher(crawlConfig);

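The excerpt stops right after the PageFetcher is constructed. For context, the usual next step in crawler4j (a sketch of the standard wiring, not taken from the Any23 source) is to build the robots.txt handling and the CrawlController from the same objects:

            // Sketch of the standard follow-up wiring (assumed, not from the Any23 source).
            // RobotstxtConfig and RobotstxtServer live in edu.uci.ics.crawler4j.robotstxt.
            final RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            final CrawlController controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);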

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();

    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Be polite: Make sure that we don't send more than 1 request per
     * second (1000 milliseconds between requests).
     */
    config.setPolitenessDelay(1000);

    /*
     * You can set the maximum crawl depth here. The default value is -1
     * (unlimited depth).
     */
    config.setMaxDepthOfCrawling(2);

    /*
     * You can set the maximum number of pages to crawl. The default value
     * is -1 (unlimited number of pages).
     */
    config.setMaxPagesToFetch(1000);

    /*
     * Do you need to set a proxy? If so, you can use:
     * config.setProxyHost("proxyserver.example.com");
     * config.setProxyPort(8080);
     *
     * If your proxy also needs authentication:
     * config.setProxyUsername(username); config.setProxyPassword(password);
     */

    /*
     * This config parameter can be used to make your crawl resumable
     * (meaning that you can resume the crawl from a previously
     * interrupted/crashed crawl). Note: if you enable resumable crawling and
     * want to start a fresh crawl, you need to delete the contents of
     * rootFolder manually.
     */
     */
    config.setResumableCrawling(false);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
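The excerpt ends at the PageFetcher; the rest of this example typically follows the standard crawler4j pattern of wiring up a robots.txt server and controller (as in the sketch above) and then seeding and starting the crawl. A sketch, where BasicCrawler is a placeholder for the user-defined WebCrawler subclass:

    // Sketch (assumed): seed and launch the crawl.
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    // Seed URLs are the starting points of the crawl.
    controller.addSeed("http://www.ics.uci.edu/");

    // Blocks until the crawl finishes; BasicCrawler must extend WebCrawler.
    controller.start(BasicCrawler.class, numberOfCrawlers);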

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];

    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();

    /*
     * The two crawlers should have different storage folders for their
     * intermediate data
     */
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");

    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);

    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);

    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
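To actually run the two crawlers concurrently, crawler4j's controller can be started in non-blocking mode. A sketch of how such an example typically finishes (the crawler class names and the robots.txt setup for each fetcher are placeholders):

    // Sketch (assumed): one controller per configuration, started without blocking.
    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer1);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer2);

    controller1.startNonBlocking(FirstCrawler.class, 5);
    controller2.startNonBlocking(SecondCrawler.class, 5);

    // Wait for both crawls to complete.
    controller1.waitUntilFinish();
    controller2.waitUntilFinish();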

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();

    config.setCrawlStorageFolder(crawlStorageFolder);

    config.setPolitenessDelay(1000);

    // Unlimited number of pages can be crawled.
    config.setMaxPagesToFetch(-1);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
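The class handed to the controller's start method is a WebCrawler subclass that decides which links to follow and what to do with each fetched page. A minimal sketch (names are placeholders; the shouldVisit signature shown is the crawler4j 4.x form and differs in older versions):

    public class MyCrawler extends WebCrawler {

        @Override
        public boolean shouldVisit(Page referringPage, WebURL url) {
            // Follow only links inside the target domain.
            return url.getURL().toLowerCase().startsWith("http://www.ics.uci.edu/");
        }

        @Override
        public void visit(Page page) {
            // Called once for every successfully fetched page.
            System.out.println("Visited: " + page.getWebURL().getURL());
        }
    }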

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    String storageFolder = args[2];

    CrawlConfig config = new CrawlConfig();

    config.setCrawlStorageFolder(rootFolder);

    /*
     * Since images are binary content, we need to set this parameter to
     * true to make sure they are included in the crawl.
     */
    config.setIncludeBinaryContentInCrawling(true);

    String[] crawlDomains = new String[] { "http://uci.edu/" };

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
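Because binary content is included, the crawler's visit method can recognize image responses and write their raw bytes under storageFolder. A rough sketch of such a method, not the actual ImageCrawler source (the naming scheme is hypothetical; Files and Paths come from java.nio.file):

    // Sketch (assumed): inside the WebCrawler subclass used with this controller.
    @Override
    public void visit(Page page) {
        String contentType = page.getContentType();
        // Keep only image responses, e.g. "image/png".
        if (contentType == null || !contentType.startsWith("image")) {
            return;
        }
        try {
            // Hypothetical naming scheme: URL hash, stored under storageFolder (args[2]).
            String filename = storageFolder + "/" + page.getWebURL().getURL().hashCode() + ".img";
            Files.write(Paths.get(filename), page.getContentData());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }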

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

  private Parser parser;
  private PageFetcher pageFetcher;

  public Downloader() {
    CrawlConfig config = new CrawlConfig();
    parser = new Parser(config);
    pageFetcher = new PageFetcher(config);
  }
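A Downloader like this is typically used to fetch and parse a single URL outside of a full crawl. The sketch below shows the general shape of such a method; the exact PageFetcher and PageFetchResult calls (fetchPage vs. fetchHeader, fetchContent's signature) changed between crawler4j versions, so treat each call as an assumption to verify against the version in use:

  // Rough sketch; all fetch/parse calls are assumptions to check against your crawler4j version.
  public Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchPage(curURL);  // older versions: fetchHeader(curURL)
      Page page = new Page(curURL);
      fetchResult.fetchContent(page);               // copy the response body into the page
      parser.parse(page, curURL.getURL());          // populate page.getParseData()
      return page;
    } catch (Exception e) {
      return null;
    } finally {
      if (fetchResult != null) {
        fetchResult.discardContentIfNotConsumed();
      }
    }
  }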

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

      return;
    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    config.setMaxPagesToFetch(10);
    config.setPolitenessDelay(1000);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);


Examples of org.sf.mustru.crawl.CrawlConfig

  public CrawlConfigWizard(boolean setup)
  {
    //*-- read from the file or initialize the configuration
    setWindowTitle("Crawl Configuration wizard");
    crawlConfig = new CrawlConfig(setup);
  }
View Full Code Here

Examples of org.sf.mustru.crawl.CrawlConfig

public OnlineIndex(Shell parentShell)
{
  super(parentShell);
  addMenuBar();
  addToolBar(SWT.FLAT);
  args = new String[2];

  //*-- set the index and database directories
  CrawlConfig crawlConfig = new CrawlConfig(false);   //*-- initialize from the properties file
  Constants.setDBDIR(crawlConfig.getDbDir());
  Constants.setINDEXDIR(crawlConfig.getIndexDir());
  Constants.setWEBDIR(crawlConfig.getWebDir());
  createDbEnv();
  //addStatusLine();
}