// Source Code of com.crawljax.core.CrawljaxController

package com.crawljax.core;


import com.crawljax.browser.BrowserPool;
import com.crawljax.browser.EmbeddedBrowser;
import com.crawljax.condition.browserwaiter.WaitConditionChecker;
import com.crawljax.condition.crawlcondition.CrawlConditionChecker;
import com.crawljax.condition.eventablecondition.EventableConditionChecker;
import com.crawljax.condition.invariant.Invariant;
import com.crawljax.core.configuration.CrawlSpecificationReader;
import com.crawljax.core.configuration.CrawljaxConfiguration;
import com.crawljax.core.configuration.CrawljaxConfigurationReader;
import com.crawljax.core.plugin.CrawljaxPluginsUtil;
import com.crawljax.core.state.Eventable;
import com.crawljax.core.state.StateFlowGraph;
import com.crawljax.oraclecomparator.StateComparator;


import net.jcip.annotations.GuardedBy;


import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;


import java.util.List;
import java.util.concurrent.TimeUnit;


/**
 * The Crawljax Controller class is the core of Crawljax.
 *
 * @author mesbah
 * @version $Id: CrawljaxController.java 446 2010-09-16 09:17:24Z slenselink@google.com $
 */
public class CrawljaxController implements CrawlQueueManager {


  /** Class-wide log4j logger. */
  private static final Logger LOGGER = Logger.getLogger(CrawljaxController.class.getName());


  /** The crawl session; stays null until one is installed via setSession(CrawlSession). */
  private CrawlSession session;


  /** Timestamp (System.currentTimeMillis()) recorded at the start of run(); 0 before that. */
  private long startCrawl;


  /** Produces the stripped DOM of a browser, see getStrippedDom(EmbeddedBrowser). */
  private final StateComparator stateComparator;
  /** Built from the crawl conditions of the crawl specification; handed to elementChecker. */
  private final CrawlConditionChecker crawlConditionChecker;
  /** Built from the configured eventable conditions; handed to elementChecker. */
  private final EventableConditionChecker eventableConditionChecker;


  /** Performs the browser waits; its wait conditions are set in the constructor. */
  private final WaitConditionChecker waitConditionChecker = new WaitConditionChecker();


  // TODO Stefan: cannot be final because it must be created after the plugins are loaded;
  // it is assigned in run().
  private Crawler initialCrawler;


  /** Reader wrapped around the CrawljaxConfiguration passed to the constructor. */
  private final CrawljaxConfigurationReader configurationReader;


  /** The invariants obtained from the crawl specification reader. */
  private final List<Invariant> invariantList;


  /**
   * Central thread starting engine.
   */
  private final CrawlerExecutor workQueue;


  /** Candidate element manager; also queried for the number of examined elements. */
  private final CandidateElementManager elementChecker;


  /** Pool that hands out and takes back the EmbeddedBrowser instances. */
  private final BrowserPool browserPool;


  /**
   * @param config
   *            the crawljax configuration.
   * @throws ConfigurationException
   *             if the configuration fails.
   */
  public CrawljaxController(final CrawljaxConfiguration config) throws ConfigurationException {
    configurationReader = new CrawljaxConfigurationReader(config);
    CrawlSpecificationReader crawlerReader =
            configurationReader.getCrawlSpecificationReader();


    stateComparator = new StateComparator(crawlerReader.getOracleComparators());
    invariantList = crawlerReader.getInvariants();
    crawlConditionChecker = new CrawlConditionChecker(crawlerReader.getCrawlConditions());
    waitConditionChecker.setWaitConditions(crawlerReader.getWaitConditions());
    eventableConditionChecker =
            new EventableConditionChecker(configurationReader.getEventableConditions());


    elementChecker =
            new CandidateElementManager(eventableConditionChecker, crawlConditionChecker);


    browserPool = new BrowserPool(configurationReader);


    workQueue = init();
  }


  /**
   * @throws ConfigurationException
   *             if the configuration fails.
   * @NotThreadSafe
   */
  private CrawlerExecutor init() throws ConfigurationException {
    LOGGER.info("Starting Crawljax...");


    LOGGER.info("Used plugins:");
    CrawljaxPluginsUtil.loadPlugins(configurationReader.getPlugins());


    if (configurationReader.getProxyConfiguration() != null) {
      CrawljaxPluginsUtil.runProxyServerPlugins(
              configurationReader.getProxyConfiguration());
    }


    LOGGER.info("Embedded browser implementation: " + configurationReader.getBrowser());


    LOGGER.info("Number of threads: "
            + configurationReader.getThreadConfigurationReader().getNumberThreads());


    LOGGER.info(
            "Crawl depth: " + configurationReader.getCrawlSpecificationReader().getDepth());
    LOGGER.info("Crawljax initialized!");


    return new CrawlerExecutor(
            configurationReader.getThreadConfigurationReader().getNumberThreads());
  }


  /**
   * Run Crawljax.
   *
   * @throws CrawljaxException
   *             If the browser cannot be instantiated.
   * @throws ConfigurationException
   *             if crawljax configuration fails.
   * @NotThreadSafe
   */
  public final void run() throws CrawljaxException, ConfigurationException {


    startCrawl = System.currentTimeMillis();


    LOGGER.info(
            "Start crawling with " + configurationReader.getAllIncludedCrawlElements().size()
                    + " crawl elements");


    // Create the initailCrawler
    initialCrawler = new InitialCrawler(this);


    // Start the Crawling by adding the initialCrawler to the the workQueue.
    addWorkToQueue(initialCrawler);


    try {
      // Block until the all the jobs are done
      workQueue.waitForTermination();
    } catch (InterruptedException e) {
      LOGGER.error(e.getMessage(), e);
    }
    
    if (workQueue.isAborted()) {
      LOGGER.warn("It apears to be that the workQueue was Aborted, "
              + "not running postcrawling plugins and not closing the browsers");
      return;
    }
    
    long timeCrawlCalc = System.currentTimeMillis() - startCrawl;
    
    /**
     * Close all the opened browsers, this is run in separate thread to have the post crawl
     * plugins to execute in the meanwhile.
     */
    Thread shutdownThread = browserPool.close();


    // TODO Stefan; Now we "re-request" a browser instance for the PostCrawlingPlugins Thread,
    // this is not ideal...
    EmbeddedBrowser b = null;
    try {
      b = this.getBrowserPool().requestBrowser();
    } catch (InterruptedException e1) {
      LOGGER.warn("Re-Request for a browser was interrupted", e1);
    }
    CrawljaxPluginsUtil.runPostCrawlingPlugins(session);
    this.getBrowserPool().freeBrowser(b);


    this.shutdown(timeCrawlCalc);


    try {
      shutdownThread.join();
    } catch (InterruptedException e) {
      LOGGER.error("could not wait for browsers to close.", e);
    }
    
  }


  /**
   * Retrieve the current session, there is only one session active at a time. So this method by
   * it self is Thread-Safe but actions on the session are NOT!
   *
   * @return the session
   */
  public CrawlSession getSession() {
    return session;
  }


  /**
   * Add work (Crawler) to the Queue of work that need to be done. The class is thread-safe.
   *
   * @param work
   *            the work (Crawler) to add to the Queue
   */
  public final void addWorkToQueue(Crawler work) {
    workQueue.execute(work);
  }


  /**
   * Removes this Crawler from the workQueue if it is present, thus causing it not to be run if it
   * has not already started.
   *
   * @param crawler
   *            the Crawler to remove
   * @return true if the crawler was removed
   */
  public boolean removeWorkFromQueue(Crawler crawler) {
    return workQueue.remove(crawler);
  }


  /**
   * Wait for a given condition. This call is thread safe as the underlying object is thread-safe.
   *
   * @param browser
   *            the browser which requires a wait condition
   */
  public final void doBrowserWait(EmbeddedBrowser browser) {
    this.waitConditionChecker.wait(browser);
  }


  /**
   * TODO Stefan: Remove this synchronization; performance loss is huge! no synchrnization fails
   * because ThreadLocal is not ThreadSafe??? get the stripped version of the dom currently in the
   * browser. This call is thread safe, must be synchronised because there is thread-intefearing
   * bug in the stateComparator.
   *
   * @param browser
   *            the browser instance.
   * @return a stripped string of the DOM tree taken from the browser.
   */
  public synchronized String getStrippedDom(EmbeddedBrowser browser) {
    return this.stateComparator.getStrippedDom(browser);
  }


  /**
   * @deprecated use the {@link #getInitialCrawler()} instead, does exactly the same.
   * @return the crawler used to initiate the Crawling run.
   */
  @Deprecated
  public final Crawler getCrawler() {
    return getInitialCrawler();
  }


  /**
   * Retrieve the initial Crawler used.
   *
   * @return the initialCrawler used to initiate the Crawling run.
   */
  public final Crawler getInitialCrawler() {
    return initialCrawler;
  }


  /**
   * Format the time the current crawl run has taken into a more readable format. Taking now as
   * the end time of the crawling.
   *
   * @return the formatted time in X min, X sec layout.
   */
  private String formatRunningTime() {
    return formatRunningTime(System.currentTimeMillis() - startCrawl);
  }


  /**
   * Format the time the current crawl run has taken into a more readable format.
   *
   * @param timeCrawlCalc
   *            the time to display
   * @return the formatted time in X min, X sec layout.
   */
  private String formatRunningTime(long timeCrawlCalc) {
    return String.format("%d min, %d sec", TimeUnit.MILLISECONDS.toMinutes(timeCrawlCalc),
            TimeUnit.MILLISECONDS.toSeconds(timeCrawlCalc) - TimeUnit.MINUTES.toSeconds(
                    TimeUnit.MILLISECONDS.toMinutes(timeCrawlCalc)));
  }


  /**
   * Terminate the crawling, Stop all threads this will cause the controller which is sleeping to
   * reactive and do the final work....
   *
   * @param isAbort
   *            if set true the terminate must be as an abort not allowing running PostCrawling
   *            plugins.
   */
  @GuardedBy("this")
  public final synchronized void terminate(boolean isAbort) {
    LOGGER.warn("After " + this.formatRunningTime()
            + " the crawling process was requested to terminate @ " + Thread.currentThread());
    browserPool.shutdown();
    workQueue.shutdownNow(isAbort);
    this.shutdown(System.currentTimeMillis() - startCrawl);
  }


  /**
   * The general shutdown procedure without running plugins or using browsers.
   */
  private void shutdown(long timeCrawlCalc) {
    StateFlowGraph stateFlowGraph = this.getSession().getStateFlowGraph();
    for (Eventable c : stateFlowGraph.getAllEdges()) {
      LOGGER.info("Interaction Element= " + c.toString());
    }
    LOGGER.info("Total Crawling time(" + timeCrawlCalc + "ms) ~= "
            + formatRunningTime(timeCrawlCalc));
    LOGGER.info("EXAMINED ELEMENTS: " + elementChecker.numberOfExaminedElements());
    LOGGER.info("CLICKABLES: " + stateFlowGraph.getAllEdges().size());
    LOGGER.info("STATES: " + stateFlowGraph.getAllStates().size());
    LOGGER.info("Dom average size (byte): " + stateFlowGraph.getMeanStateStringSize());
    LOGGER.info("DONE!!!");
  }


  /**
   * The current element checker in use. This call is thread-safe because it returns a final
   * field.
   *
   * @return the elementChecker used to register the checked elements.
   */
  public final ExtractorManager getElementChecker() {
    return elementChecker;
  }


  /**
   * @return the configurationReader
   */
  public CrawljaxConfigurationReader getConfigurationReader() {
    return configurationReader;
  }


  /**
   * @return the browser pool.
   */
  public BrowserPool getBrowserPool() {
    return browserPool;
  }


  /**
   * Return the CrawlQueueManager in use, which currently is this controller itself. This
   * method is designed for extension purposes: it makes it possible to move
   * {@link #addWorkToQueue(Crawler)} and {@link #removeWorkFromQueue(Crawler)} out of this
   * class later, with callers depending only on the interface.
   *
   * @return the crawlQueueManager that is used.
   */
  public CrawlQueueManager getCrawlQueueManager() {
    return this;
  }


  /**
   * @return the invariantList
   */
  public final List<Invariant> getInvariantList() {
    return invariantList;
  }


  /**
   * Install a new CrawlSession, replacing whatever session was stored before. The value is
   * stored as given; no null check is performed here.
   *
   * @param session
   *            set the new value for the session
   */
  public void setSession(CrawlSession session) {
    this.session = session;
  }


  /**
   * @return the startCrawl
   */
  public final long getStartCrawl() {
    return startCrawl;
  }


  @Override
  public void waitForTermination() throws InterruptedException {
    this.workQueue.waitForTermination();
  }


}
// Source Code of com.crawljax.core.CrawljaxController

// Related Classes of com.crawljax.core.CrawljaxController