// Package fr.eolya.crawler
// Source code of fr.eolya.crawler.Crawler

package fr.eolya.crawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;

import gnu.getopt.Getopt;
import fr.eolya.crawler.connectors.ConnectorFactory;
import fr.eolya.crawler.connectors.ISource;
import fr.eolya.crawler.database.CrawlerDBFactory;
import fr.eolya.crawler.database.ICrawlerDB;
import fr.eolya.crawler.queue.ISourceQueue;
import fr.eolya.crawler.queue.QueueFactory;
import fr.eolya.crawler.utils.CrawlerUtils;
import fr.eolya.utils.Logger;
import fr.eolya.utils.Utils;
import fr.eolya.utils.XMLConfig;
import fr.eolya.utils.nosql.DBConnectionFactory;
import fr.eolya.utils.nosql.IDBConnection;

public class Crawler implements ICrawlerController {

  private Logger logger;
  private XMLConfig config;
  private boolean stopRequested;
  private List<String> countryInclude = null;
  private List<String> countryExclude = null;
  private IDBConnection dbConnection = null;
  private ICrawlerDB crawlerDB = null;

  public Crawler(XMLConfig config) throws UnknownHostException {
    this.config = config;
  }

  private static void usage() {
    System.out.println("Usage : java Crawler -p <properties file> [-a <account id>] [-s <source id>] [-c <crawler engine id>] [-o] [-e] [-d] [-r] [-l] [-i]");
    System.out.println("    -a : account id (default = 1)");
    System.out.println("    -s : source id (default = 1)");
    System.out.println("    -c : crawler engine id");
    System.out.println("    -o : once");
    System.out.println("    -e : suspicious only");
    System.out.println("    -d : reset source (-s required)");
    System.out.println("    -k : reset source from cache (-s required)");
    System.out.println("    -r : rescan source (-s required)");
    System.out.println("    -l : rescan source deeper (-s required)");
    System.out.println("    -i : interactive only (reset or rescan source request in web ui)");
    System.out.println("    -t : test only");
    System.out.println("    -v : verbose");
  }

  /**
   * @param args
   */
  public static void main(String[] args) {

    if (args.length == 0) {
      usage();
      System.exit(-1);
    }

    Getopt g = new Getopt("Crawler", args, "p:a:s:c:oerditvk");
    g.setOpterr(false);
    int c;

    boolean once = false;
    boolean suspiciousOnly = false;
    boolean reScan = false;
    boolean interactiveOnly = false;
    boolean reset = false;
    boolean resetFromCache = false;
    boolean deeper = false;
    boolean test = false;
    String propFileName = "";
    String accountId = "1";
    String sourceId = "";
    String engineId = "";
    boolean verbose = false;

    while ((c = g.getopt()) != -1) {
      switch(c) {
      case 'p':
        propFileName = g.getOptarg();
        break;

      case 'a':
        accountId = g.getOptarg();
        if ("all".equals(accountId)) accountId="";
        break;

      case 's':
        sourceId = g.getOptarg();
        break;

      case 'c':
        engineId = g.getOptarg();
        break;

      case 'o':
        once = true;
        break;

      case 'i':
        interactiveOnly = true;
        break;

      case 'e':
        suspiciousOnly = true;
        once = true;
        break;

      case 'r':
        reScan = true;
        break;

      case 'k':
        resetFromCache = true;
        break;

      case 'd':
        reset = true;
        break;

      case 'l':
        deeper = true;
        break;

      case 't':
        test = true;
        once = true;
        break;

      case 'v':
        verbose = true;
        break;
      }
    }

    if ((reset || reScan || deeper || resetFromCache) && "".equals(sourceId)) {
      System.out.println("Error: source required with -r or -d option");
      System.exit(-1);
    }

    if ("".equals(propFileName)) {
      System.out.println("Error: no properties file specified");
      System.exit(-1);
    }

    propFileName = Utils.getValidPropertyPath(propFileName, null, "HOME");
    System.out.println("Config file = " + propFileName);

    XMLConfig config = new XMLConfig();
    try {
      File configFile =new File(propFileName);
      if (!configFile.exists()) {
        System.out.println("Error configuration file not found [" + propFileName + "]");
        System.exit(-1);
      }
      config.loadFile(propFileName);
    }
    catch(IOException e) {
      System.out.println("Error while reading properties file");
      e.printStackTrace();
      System.exit(-1);     
    }

    String witnessFilesPath = config.getProperty("/crawler/param[@name='witness_files_path']");
    witnessFilesPath = Utils.getValidPropertyPath(witnessFilesPath, null, "HOME");
    if (witnessFilesPath == null || "".equals(witnessFilesPath)) {
      System.out.println("Error : missing witness_files_path propertie");
      System.exit(-1);
    }

    String pidFileName = witnessFilesPath + "/crawler.pid";
    if (test) pidFileName = witnessFilesPath + "/crawler_test.pid";
    File filePid = new File(pidFileName);
    if (filePid.exists()) {
      System.out.println("A crawler instance is already running");
      System.exit(-1);
    }

    try {
      BufferedWriter out = new BufferedWriter(new FileWriter(filePid.getAbsolutePath()));
      out.write(Utils.getProcessId());
      out.close();
    }
    catch (Exception e) {
      System.out.println("Error while creating file : " + filePid.getAbsolutePath());
      e.printStackTrace();
      System.exit(-1);
    }

    try {
      Crawler crawler = new Crawler(config);
      crawler.run(accountId, sourceId, engineId, once, suspiciousOnly, interactiveOnly, reScan, reset, deeper, resetFromCache, test, verbose);
    }
    catch (Exception e) {
      e.printStackTrace();
    }
    filePid.delete();
  }

  public void run(
      String accountId,
      String sourceId,
      String engineId,
      boolean once,
      boolean suspiciousOnly,
      boolean interactiveOnly,
      boolean reScan,
      boolean reset,
      boolean deeper,
      boolean resetFromCache,
      boolean test,
      boolean verbose)
          throws InstantiationException, IllegalAccessException {

    boolean forceOnce = ("1".equals(config.getProperty("/crawler/param[@name='force_once']", "0")));
    if (forceOnce) once = true;

    String logFileName = "";
    if (test) logFileName = config.getProperty("/crawler/param[@name='log_filename_test']");
    if ("".equals(logFileName)) logFileName = config.getProperty("/crawler/param[@name='log_filename']");
    logFileName = Utils.getValidPropertyPath(logFileName, null, "HOME");
    logger = new Logger(logFileName);

    String witnessFilesPath = Utils.getValidPropertyPath(config.getProperty("/crawler/param[@name='witness_files_path']"), null, "HOME");

    String pidFileName = witnessFilesPath + "/crawler.pid";
    if (test) pidFileName = witnessFilesPath + "/crawler_test.pid";

    String stopFileName = witnessFilesPath + "/crawler.stop";
    if (test) stopFileName = witnessFilesPath + "/crawler_test.stop";

    File filePid = new File(pidFileName);
    File fileStop = new File(stopFileName);

    countryInclude = Arrays.asList(config.getProperty("/crawler/param[@name='country_include']", "").replaceAll("\\s*", "").split(","));
    countryExclude = Arrays.asList(config.getProperty("/crawler/param[@name='country_exclude']", "").replaceAll("\\s*", "").split(","));

    stopRequested = false;

    int limit = Integer.parseInt(config.getProperty("/crawler/param[@name='max_simultaneous_source']", "4"));
    if (!"".equals(sourceId)) limit = 1;

    dbConnection = getDBConnection(true);
    if (dbConnection==null) {
      logger.log("Failed to connect do database !");
      System.out.println("Failed to connect do database !");
      return;
    }
   
    String dbType = config.getProperty("/crawler/database/param[@name='dbtype']", "");
    String dbName = config.getProperty("/crawler/database/param[@name='dbname']", "");

    String dbNameQueues = config.getProperty("/crawler/queues/param[@name='dbname']", "");
   
    ISourceQueue sourceQueue = QueueFactory.getSourceQueueInstance(dbType, dbConnection, dbName, "sources", test, interactiveOnly, suspiciousOnly, accountId, sourceId, engineId);
    crawlerDB = CrawlerDBFactory.getCrawlerDBInstance(dbType, dbConnection, dbName, dbNameQueues, logger);

    logger.log("=================================");
    logger.log("Crawler starting (version: " + StringUtils.trimToEmpty(crawlerDB.getVersion()) + ")");
    logger.log("    Simultaneous sources crawled : " + String.valueOf(limit));
    if (!"".equals(accountId))
      logger.log("    account : " + accountId);
    if (!"".equals(engineId))
      logger.log("    engine : " + engineId);
    if (once)
      logger.log("    mode once");
    if (!"".equals(sourceId))
      logger.log("    source : " + sourceId);
    if (suspiciousOnly)
      logger.log("    mode suspicious");
    if (reScan)
      logger.log("    mode rescan");
    if (reset)
      logger.log("    mode reset");
    if (deeper)
      logger.log("    mode deeper");
    if (interactiveOnly)
      logger.log("    mode interactive only");
    if (test)
      logger.log("    mode test");
    if (verbose)
      logger.log("    mode verbose");
    logger.log("");

    logger.log("=================================");
    logger.log("");

    crawlerDB.fixStartupSourcesStatus();

    ThreadPoolExecutor sourceExecutor = (ThreadPoolExecutor) Executors.newFixedThreadPool(limit);

    boolean bFinished = false;
    while (!stopRequested && !bFinished) {
      try {
        stopRequested = fileStop.exists() || !filePid.exists();
        if (stopRequested) break;
       
        // Refresh PID file time
        try {
          FileUtils.touch(filePid);
        }
        catch (Exception e) {
          e.printStackTrace();
          System.exit(-1);
        }

        // How many sources are enqueued ?
        long countSource = sourceQueue.size();
        logger.log("    Sources to be crawled : " + String.valueOf(countSource));

        // If all threads are still processing a source wait and retry
        if (!sourceExecutor.getQueue().isEmpty() && (sourceExecutor.getActiveCount()==limit)) { 
          logger.log("    All threads are busy : wait and retry in a few seconds");
          Utils.sleep(15000)
          continue;
        }

        // We pop a new source from source queue only if thread pool queue is empty
        // we want to pop a new source from source queue at the very last time
        // as source queue content can change at any time

        if (countSource==0 && once) {
          // try waiting 5 minutes
          logger.log("    No more source to crawl : start waiting 5 minutes");
          if ("".equals(sourceId)) {
            int waitingSince = 0;
            stopRequested = fileStop.exists() || !filePid.exists();
            while (waitingSince<300*1000 && countSource==0 && !stopRequested) {
              Utils.sleep(5000);
              waitingSince += 5000;
              countSource = sourceQueue.size();
              stopRequested = fileStop.exists() || !filePid.exists();
            }
          }
          if (countSource == 0) {
            // No source to crawl after waiting 5 minutes => stop crawling
            if (!stopRequested) logger.log("    No more source to crawl after waiting 5 minutes and mode once : stop crawling");
            bFinished = true;
            continue;
          }
        }

        Map<String,Object> srcData = sourceQueue.pop();
        if (srcData!=null) {
          String srcId = String.valueOf(srcData.get("id"));

          if (CrawlerUtils.isAcceptedCountry((String)srcData.get("country"), countryInclude, countryExclude) || !"".equals(sourceId)) {

            String sourceCrawlMode = CrawlerUtils.getSourceCrawlMode(Integer.parseInt((String)srcData.get("crawl_mode")), reScan, reset, deeper, resetFromCache) ;

            // Build the source item according to its type and so its class
            ISource src = ConnectorFactory.getSourceInstance(crawlerDB.getSourceClass((String)srcData.get("type")), srcId, sourceCrawlMode, srcData);

            if (src!=null && src.isCrawlAllowedBySchedule()) {
              logger.log("        Pushing source : " + String.valueOf(src.getId()));
              sourceExecutor.submit(new ProcessorSource(src, config, logger, this));
            } else {
              sourceQueue.unpop(Integer.valueOf(srcId));
              logger.log("        Skip source due to schedule : " + String.valueOf(src.getId()));                     
            }
          } else {
            sourceQueue.unpop(Integer.valueOf(srcId));
            logger.log("        Skip source due to country : " + srcId);                                                       
          }
        }
      } catch (Exception e) {
        e.printStackTrace();
      }

      // Stop requested ?
      stopRequested = fileStop.exists() || !filePid.exists();
      if (!bFinished && !stopRequested) Utils.sleep(15000);           
    }

    // Stop threads
    CrawlerUtils.executorShutdownAndWait(sourceExecutor, 0, 1, TimeUnit.DAYS);
  }

  /*
   * (non-Javadoc)
   * @see fr.eolya.crawler.IController#stopRequested()
   */
  public boolean stopRequested() {
    return stopRequested;
  }
  public IDBConnection getDBConnection(boolean forceReconnect) {
    if (dbConnection==null || forceReconnect) {
      if (dbConnection!=null) {
        dbConnection.close();
        dbConnection = null;
      }
      String dbType = config.getProperty("/crawler/database/param[@name='dbtype']", "");
      String dbHost = config.getProperty("/crawler/database/param[@name='dbhost']", "localhost");
      int dbPort = Integer.parseInt(config.getProperty("/crawler/database/param[@name='dbport']", ""));
      String dbUser = config.getProperty("/crawler/database/param[@name='dbuser']", "");
      String dbPassword = config.getProperty("/crawler/database/param[@name='dbpassword']", "");
      dbConnection = DBConnectionFactory.getDBConnectionInstance(dbType, dbHost, dbPort, dbUser, dbPassword);
    }
    return dbConnection;
  }

  public ICrawlerDB getCrawlerDB() {
    return crawlerDB;
  }
}
// Related classes of fr.eolya.crawler.Crawler
// All source code is the property of its respective owners.