Source Code of edu.uci.ics.crawler4j.crawler.CrawlController

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package edu.uci.ics.crawler4j.crawler;

import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.IO;
import org.apache.log4j.Logger;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
* The controller that manages a crawling session. This class creates the
* crawler threads and monitors their progress.
*
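 * <p>A minimal usage sketch (a blocking crawl; {@code MyCrawler} is a
 * hypothetical {@link WebCrawler} subclass, and the storage folder is an
 * illustrative choice, not a default):
 * <pre>{@code
 * CrawlConfig config = new CrawlConfig();
 * config.setCrawlStorageFolder("/tmp/crawler4j-root");
 *
 * PageFetcher pageFetcher = new PageFetcher(config);
 * RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
 * RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
 *
 * CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
 * controller.addSeed("http://www.example.com/");
 * controller.start(MyCrawler.class, 8); // blocks until the crawl finishes
 * }</pre>
 *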
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class CrawlController extends Configurable {

  private static final Logger logger = Logger.getLogger(CrawlController.class.getName());

  /**
   * The 'customData' object can be used for passing custom crawl-related
   * configurations to different components of the crawler.
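   *
   * <p>For example (a sketch; assumes a crawler thread reads the object back
   * through its {@code getMyController()} accessor):
   * <pre>{@code
   * controller.setCustomData(mySettingsMap); // any shared object
   * // ... inside a WebCrawler subclass:
   * Object settings = getMyController().getCustomData();
   * }</pre>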
   */
  protected Object customData;

  /**
   * Once the crawling session finishes, the controller collects the local
   * data of the crawler threads and stores it in this list.
   */
  protected List<Object> crawlersLocalData = new ArrayList<Object>();

  /**
   * Is the crawling of this session finished?
   */
  protected boolean finished;

  /**
   * Is the crawling session set to 'shutdown'? Crawler threads monitor this
   * flag, and when it is set they no longer process new pages.
   */
  protected boolean shuttingDown;

  protected PageFetcher pageFetcher;
  protected RobotstxtServer robotstxtServer;
  protected Frontier frontier;
  protected DocIDServer docIdServer;

  protected final Object waitingLock = new Object();

  public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer)
      throws Exception {
    super(config);

    config.validate();
    File folder = new File(config.getCrawlStorageFolder());
    if (!folder.exists()) {
      if (!folder.mkdirs()) {
        throw new Exception("Couldn't create this folder: " + folder.getAbsolutePath());
      }
    }

    boolean resumable = config.isResumableCrawling();

    EnvironmentConfig envConfig = new EnvironmentConfig();
    envConfig.setAllowCreate(true);
    envConfig.setTransactional(resumable);
    envConfig.setLocking(resumable);

    File envHome = new File(config.getCrawlStorageFolder() + "/frontier");
    if (!envHome.exists()) {
      if (!envHome.mkdir()) {
        throw new Exception("Couldn't create this folder: " + envHome.getAbsolutePath());
      }
    }
    if (!resumable) {
      IO.deleteFolderContents(envHome);
    }

    Environment env = new Environment(envHome, envConfig);
    docIdServer = new DocIDServer(env, config);
    frontier = new Frontier(env, config, docIdServer);

    this.pageFetcher = pageFetcher;
    this.robotstxtServer = robotstxtServer;

    finished = false;
    shuttingDown = false;
  }

  /**
   * Start the crawling session and wait for it to finish.
   *
   * @param _c
   *            the class that implements the logic for crawler threads
   * @param numberOfCrawlers
   *            the number of concurrent threads that will be contributing to
   *            this crawling session.
   */
  public <T extends WebCrawler> void start(final Class<T> _c, final int numberOfCrawlers) {
    this.start(_c, numberOfCrawlers, true);
  }

  /**
   * Start the crawling session and return immediately.
   *
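   * <p>A sketch of non-blocking use ({@code MyCrawler} is a hypothetical
   * {@link WebCrawler} subclass):
   * <pre>{@code
   * controller.startNonBlocking(MyCrawler.class, 4);
   * // ... do other work while the crawl proceeds ...
   * controller.waitUntilFinish();
   * }</pre>
   *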
   * @param _c
   *            the class that implements the logic for crawler threads
   * @param numberOfCrawlers
   *            the number of concurrent threads that will be contributing to
   *            this crawling session.
   */
  public <T extends WebCrawler> void startNonBlocking(final Class<T> _c, final int numberOfCrawlers) {
    this.start(_c, numberOfCrawlers, false);
  }

  protected <T extends WebCrawler> void start(final Class<T> _c, final int numberOfCrawlers, boolean isBlocking) {
    try {
      finished = false;
      crawlersLocalData.clear();
      final List<Thread> threads = new ArrayList<Thread>();
      final List<T> crawlers = new ArrayList<T>();

      for (int i = 1; i <= numberOfCrawlers; i++) {
        T crawler = _c.newInstance();
        Thread thread = new Thread(crawler, "Crawler " + i);
        crawler.setThread(thread);
        crawler.init(i, this);
        thread.start();
        crawlers.add(crawler);
        threads.add(thread);
        logger.info("Crawler " + i + " started.");
      }

      final CrawlController controller = this;

      Thread monitorThread = new Thread(new Runnable() {

        @Override
        public void run() {
          try {
            synchronized (waitingLock) {
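              // Monitor loop: every 10 seconds, recreate any crawler thread
              // that died (unless the session is shutting down) and check
              // whether any crawler is still processing URLs.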

              while (true) {
                sleep(10);
                boolean someoneIsWorking = false;
                for (int i = 0; i < threads.size(); i++) {
                  Thread thread = threads.get(i);
                  if (!thread.isAlive()) {
                    if (!shuttingDown) {
                      logger.info("Thread " + i + " was dead, I'll recreate it.");
                      T crawler = _c.newInstance();
                      thread = new Thread(crawler, "Crawler " + (i + 1));
                      threads.remove(i);
                      threads.add(i, thread);
                      crawler.setThread(thread);
                      crawler.init(i + 1, controller);
                      thread.start();
                      crawlers.remove(i);
                      crawlers.add(i, crawler);
                    }
                  } else if (crawlers.get(i).isNotWaitingForNewURLs()) {
                    someoneIsWorking = true;
                  }
                }
                if (!someoneIsWorking) {
                  // Double-check that no crawler thread is still working.
                  logger.info("It looks like no thread is working. Waiting 10 seconds to make sure...");
                  sleep(10);

                  someoneIsWorking = false;
                  for (int i = 0; i < threads.size(); i++) {
                    Thread thread = threads.get(i);
                    if (thread.isAlive() && crawlers.get(i).isNotWaitingForNewURLs()) {
                      someoneIsWorking = true;
                    }
                  }
                  if (!someoneIsWorking) {
                    if (!shuttingDown) {
                      long queueLength = frontier.getQueueLength();
                      if (queueLength > 0) {
                        continue;
                      }
                      logger.info("No thread is working and no more URLs are in queue waiting for another 10 seconds to make sure...");
                      sleep(10);
                      queueLength = frontier.getQueueLength();
                      if (queueLength > 0) {
                        continue;
                      }
                    }

                    logger.info("All of the crawlers are stopped. Finishing the process...");
                    // At this point, the frontier notifies the threads that
                    // were waiting for new URLs that they should stop.
                    frontier.finish();
                    for (T crawler : crawlers) {
                      crawler.onBeforeExit();
                      crawlersLocalData.add(crawler.getMyLocalData());
                    }

                    logger.info("Waiting for 10 seconds before final clean up...");
                    sleep(10);

                    frontier.close();
                    docIdServer.close();
                    pageFetcher.shutDown();

                    finished = true;
                    waitingLock.notifyAll();

                    return;
                  }
                }
              }
            }
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      });

      monitorThread.start();

      if (isBlocking) {
        waitUntilFinish();
      }

    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Wait until this crawling session finishes.
   */
  public void waitUntilFinish() {
    while (!finished) {
      synchronized (waitingLock) {
        if (finished) {
          return;
        }
        try {
          waitingLock.wait();
        } catch (InterruptedException e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Once the crawling session finishes, the controller collects the local
   * data of the crawler threads and stores it in a list. This method returns
   * a reference to that list.
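   *
   * <p>For example (a sketch; assumes each crawler's {@code getMyLocalData()}
   * returned a {@code Long} page count):
   * <pre>{@code
   * long totalPages = 0;
   * for (Object localData : controller.getCrawlersLocalData()) {
   *   totalPages += (Long) localData;
   * }
   * }</pre>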
   */
  public List<Object> getCrawlersLocalData() {
    return crawlersLocalData;
  }

  protected void sleep(int seconds) {
    try {
      Thread.sleep(seconds * 1000);
    } catch (Exception ignored) {
    }
  }

  /**
   * Adds a new seed URL. A seed URL is fetched by the crawler, and the URLs
   * found in it are extracted and followed for crawling.
   *
   * @param pageUrl
   *            the URL of the seed
   */
  public void addSeed(String pageUrl) {
    addSeed(pageUrl, -1);
  }

  /**
   * Adds a new seed URL with a specific document id assigned to it. A seed
   * URL is fetched by the crawler, and the URLs found in it are extracted
   * and followed for crawling. The document id must be unique. Note that if
   * you add three seeds with document ids 1, 2, and 7, the next URL found
   * during the crawl will get a doc id of 8. You also need to add seeds in
   * increasing order of document ids.
   *
   * Specifying doc ids is mainly useful when you have stored the results of
   * a previous crawl and want to start a new crawl whose seeds get the same
   * document ids as in the previous crawl.
   *
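   * <p>For example (a sketch of re-seeding with ids from a previous crawl):
   * <pre>{@code
   * controller.addSeed("http://www.example.com/", 1);
   * controller.addSeed("http://www.example.com/a.html", 2);
   * controller.addSeed("http://www.example.com/b.html", 7);
   * // The next newly discovered URL gets doc id 8.
   * }</pre>
   *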
   * @param pageUrl
   *            the URL of the seed
   * @param docId
   *            the document id that you want to be assigned to this seed URL.
   *
   */
  public void addSeed(String pageUrl, int docId) {
    String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
    if (canonicalUrl == null) {
      logger.error("Invalid seed URL: " + pageUrl);
      return;
    }
    if (docId < 0) {
      docId = docIdServer.getDocId(canonicalUrl);
      if (docId > 0) {
        // This URL is already seen.
        return;
      }
      docId = docIdServer.getNewDocID(canonicalUrl);
    } else {
      try {
        docIdServer.addUrlAndDocId(canonicalUrl, docId);
      } catch (Exception e) {
        logger.error("Could not add seed: " + e.getMessage());
      }
    }

    WebURL webUrl = new WebURL();
    webUrl.setURL(canonicalUrl);
    webUrl.setDocid(docId);
    webUrl.setDepth((short) 0);
    if (!robotstxtServer.allows(webUrl)) {
      logger.info("Robots.txt does not allow this seed: " + pageUrl);
    } else {
      frontier.schedule(webUrl);
    }
  }

  /**
   * This function can be called to assign a specific document id to a URL.
   * This feature is useful when you have stored the URLs and their associated
   * document ids from a previous crawl and want a new crawl that is aware of
   * the previously seen URLs and won't re-crawl them.
   *
   * Note that if you add three seen URLs with document ids 1, 2, and 7, the
   * next URL found during the crawl will get a doc id of 8. You also need to
   * add seen URLs in increasing order of document ids.
   *
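   * <p>For example (a sketch; the URLs are illustrative):
   * <pre>{@code
   * controller.addSeenUrl("http://www.example.com/old1.html", 1);
   * controller.addSeenUrl("http://www.example.com/old2.html", 2);
   * controller.addSeed("http://www.example.com/", 3);
   * }</pre>
   *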
   * @param url
   *            the URL of the page
   * @param docId
   *            the document id that you want to be assigned to this URL.
   *
   */
  public void addSeenUrl(String url, int docId) {
    String canonicalUrl = URLCanonicalizer.getCanonicalURL(url);
    if (canonicalUrl == null) {
      logger.error("Invalid Url: " + url);
      return;
    }
    try {
      docIdServer.addUrlAndDocId(canonicalUrl, docId);
    } catch (Exception e) {
      logger.error("Could not add seen url: " + e.getMessage());
    }
  }

  public PageFetcher getPageFetcher() {
    return pageFetcher;
  }

  public void setPageFetcher(PageFetcher pageFetcher) {
    this.pageFetcher = pageFetcher;
  }

  public RobotstxtServer getRobotstxtServer() {
    return robotstxtServer;
  }

  public void setRobotstxtServer(RobotstxtServer robotstxtServer) {
    this.robotstxtServer = robotstxtServer;
  }

  public Frontier getFrontier() {
    return frontier;
  }

  public void setFrontier(Frontier frontier) {
    this.frontier = frontier;
  }

  public DocIDServer getDocIdServer() {
    return docIdServer;
  }

  public void setDocIdServer(DocIDServer docIdServer) {
    this.docIdServer = docIdServer;
  }

  public Object getCustomData() {
    return customData;
  }

  public void setCustomData(Object customData) {
    this.customData = customData;
  }

  public boolean isFinished() {
    return this.finished;
  }

  public boolean isShuttingDown() {
    return shuttingDown;
  }

  /**
   * Sets the current crawling session to 'shutdown'. Crawler threads monitor
   * the shutdown flag, and when it is set to true they will no longer
   * process new pages.
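   *
   * <p>For example (a sketch of a graceful stop from another thread):
   * <pre>{@code
   * controller.Shutdown();        // stop feeding new pages to the crawlers
   * controller.waitUntilFinish(); // wait for in-flight work to wind down
   * }</pre>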
   */
  public void Shutdown() {
    logger.info("Shutting down...");
    this.shuttingDown = true;
    frontier.finish();
  }
}