// Package org.sf.mustru.crawl
//
// Source Code of org.sf.mustru.crawl.CrawlManager

package org.sf.mustru.crawl;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.lang.RuntimeException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.Properties;

import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
import org.sf.mustru.utils.StandardBgramTokenizerFactory;
import org.sf.mustru.utils.StringTools;
import org.sf.mustru.utils.TrainSpellChecker;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.eclipse.core.internal.runtime.HashMapOfString;

import com.aliasi.lm.NGramProcessLM;
import com.aliasi.spell.FixedWeightEditDistance;
import com.aliasi.tokenizer.TokenizerFactory;

/**
 * Manage the crawl <br>
 * <ol>
 * <li> Start the crawl - open the index and databases <br>
 * <li> Read the task file from crawlTask and start the individual crawl threads <br>
 * <li> Wait for the crawl threads to end and dump statistics <br>
 * </ol>
 */
public class CrawlManager
static Logger logger = Logger.getLogger(CrawlManager.class.getName());
private static CrawlManager ctRef = null;

//*-- timers and statistics
private HashMapOfString stats = new HashMapOfString()//*-- a hash to keep track of statistics by media type
private String[] timers = Constants.getTIMERS();    //*-- a list of timers
private HashMapOfString timerHash = new HashMapOfString()//*-- hash to keep track of timers of various events
private String[] docTypes = null;        //*-- types of documents that will be processed

//*-- Lucene vars
private IndexWriter iw = null;      //*-- Filesystem based indexWriter object for Lucene
private FSDirectory fsd;      //*-- Filesystem directory to store the index
private Analyzer analyzer;      //*-- tokenizer for the search engine

//*-- Lingpipe vars
private TrainSpellChecker sc = null;     //*-- spell checker from Lingpipe
private final static int NGRAM_LENGTH = 5;     private final static double MATCH_WEIGHT = -0.0;
private final static double DELETE_WEIGHT = -4.0;   private final static double INSERT_WEIGHT = -1.0;
private final static double SUBSTITUTE_WEIGHT = -2.0;   private final static double TRANSPOSE_WEIGHT = -2.0;
//*-- vars for the task file
private RandomAccessFile taskFile = null//*-- handle to read the task file
private long[] filePos = null;      //*-- list of file positions in task file
private int[] currentDoc = null;    //*-- current document positions in each thread
private boolean enoughResources;    //*-- flag to indicate a thread ran out of memory

  * Create a single instance of this class
private CrawlManager() {  }
public static CrawlManager getCrawlManager() throws NullPointerException
{ if (ctRef != null)
     { logger.warn("Cannot run two instances of CrawlManager");
       throw new RuntimeException("Cannot run two instances of CrawlManager"); }

   //*-- instantiate a new crawl
   ctRef = new CrawlManager();
   if (Constants.getDbt() == null)
   { DbTools dbt = new DbTools();
     dbt.openEnv(Constants.getDBDIR(), false); Constants.setDbt(dbt);

/**
 * Discard the reference to the current instance so that getCrawlManager()
 * will create a new one on its next call.
 */
public static void resetCrawlManager()
{
  ctRef = null;
}

  * Initialize the crawl. <br><br>
  * 1. Create a Lucene IndexWriter <br>
  * 2. Create the Berkeley databases <br>
  * 3. Initialize a timer hash <br>
  * @param crawlConfig  Crawl configuration object
public void initCrawl(CrawlConfig crawlConfig)
  boolean freshCrawl = crawlConfig.isFreshCrawl();
  int numThreads = crawlConfig.getNumThreads();
  initTime =- new Date().getTime();

  Properties props = new Properties();
  try { props.load(new BufferedInputStream(new FileInputStream(Constants.DOCTYPES_FILE))); }
  catch ( IOException e) { ctRef.cleanUp("Could not open " + Constants.DOCTYPES_FILE + " " + e.getMessage()); }

  //*-- load the types of media from the docTypes properties file
  docTypes = new String[props.size()]; int i = 0; Integer integerZero = new Integer(0);
  for (Enumeration keys = props.propertyNames(); keys.hasMoreElements(); )
  { String key = (String) keys.nextElement(); stats.put(key, integerZero.toString() );
  docTypes[i++] = key; }
  stats.put("duplicates", integerZero.toString() );
  stats.put("indexed earlier", integerZero.toString() );

  //*-- create the single filesystem based Lucene IndexWriter
  //*-- FreshIndex value :   -1  = freshCrawl
  //*--         0  = false
  //*--         *  = true
  boolean freshIndex = (crawlConfig.getFreshIndex() == -1) ? freshCrawl:
         (crawlConfig.getFreshIndex() ==  0) ? false: true;
   fsd = FSDirectory.getDirectory(new File(Constants.getINDEXDIR()), freshIndex);
   analyzer = new StandardBgramAnalyzer(); iw = new IndexWriter(fsd, analyzer, freshIndex);
   iw.setSimilarity(new SearchSimilarity());
  catch (IOException ie) { ctRef.cleanUp("Could not get IndexWriter " + ie.getMessage() ); }

  //*-- create the spell checker
  if (crawlConfig.isSpellCheck())
  { FixedWeightEditDistance fixedEdit = new FixedWeightEditDistance( MATCH_WEIGHT, DELETE_WEIGHT, INSERT_WEIGHT,
    NGramProcessLM lm = new NGramProcessLM(NGRAM_LENGTH);
    TokenizerFactory tokenizerFactory = new StandardBgramTokenizerFactory(false)//*-- do not extract entities
    try { if ( (new File(Constants.SPELL_TRAIN_MODEL).exists() ) && !crawlConfig.isFreshCrawl() )
             lm = readModel(Constants.SPELL_TRAIN_MODEL)
          sc = new TrainSpellChecker(lm, fixedEdit, tokenizerFactory); }
    catch (IOException ie) { logger.error("IO Error: Could not read spell train file " + ie.getMessage()); }
    catch (ClassNotFoundException ce) { logger.error("Class error: " + ce.getMessage()); }
  //*-- Create the databases, if necessary
  DbTools dbt = Constants.getDbt();
  boolean createFlag = true; boolean dupFlag = false
  if (freshCrawl)
   //*-- initialize and create a new database for the list of extracted files and a secondary
   //*-- database for the file signature
   dbt.createDB(Constants.EXT_FILES_DB, createFlag, dupFlag);"Created py. database " + Constants.EXT_FILES_DB);
   SecKeyDoc skd = new SecKeyDoc( new IndexableDoc().getBdbBinding() );
   if (dbt.createSecDB(Constants.EXT_FILES_SECDB, false, skd))"Created sec. database " + Constants.EXT_FILES_SECDB);
   dbt.closeSecDB(); dbt.closeDB();

   //*-- create the messages database
   dbt.createDB(Constants.EXT_MESSAGES_DB, createFlag, dupFlag);"Created py. database " + Constants.EXT_MESSAGES_DB);


  //*-- initialize the timer hash
  timers = Constants.getTIMERS();
  for (int j = 0; j < timers.length ; j++)
   for (int k = 0; k < numThreads; k++)
    timerHash.put(timers[j] + "_" + k, " ");

  initTime += new Date().getTime()

  *  Scan the contents of the task file. Each line contains the path to an indexable document
  *  on the filesystem. Use the thread number and number of threads to distribute the load evenly
  *  across all threads. The text for each file is extracted, classified, and indexed.
  * @param crawlConfig Crawl configuration object
  * @param crawlThread List of index doc threads
public void startThreads(CrawlConfig crawlConfig, CrawlThread[] crawlThread)
  //*-- read the list of files to process into an array
  int numThreads = crawlConfig.getNumThreads()
  ArrayList<Long> fileList = new ArrayList<Long>();
  { taskFile = new RandomAccessFile(Constants.TASK_FILE, "r");;
    int i = 0; fileList.add(i, new Long( taskFile.getFilePointer() ) );
    String fileName = "";
    LOOP: while ((fileName = taskFile.readLine()) != null)
    { if ( (crawlConfig.getStartPosition() <= 0) && (!(new File(fileName).canRead())) )
        continue LOOP;  //*-- make sure that the file is readable before adding to the list
      fileList.add(i, new Long( taskFile.getFilePointer() ) );
  catch (FileNotFoundException fe)
  { cleanUp("Could not find task file " + Constants.TASK_FILE + " " + fe.getMessage())}
  catch (IOException ie)
  { cleanUp("Could not read task file " + Constants.TASK_FILE + " " + ie.getMessage())}

  //*-- set the file positions for each file in the task file
  filePos = new long[fileList.size() - 1];
  for (int i = 0; i < filePos.length; i++) filePos[i] = ( (Long) fileList.get(i)).longValue();
  fileList = null;

  //*-- set the Lucene parameter for buffering documents
  iw.setMaxBufferedDocs( (totalNumFiles() > 9) ? Constants.LUCENE_MAX_BUFFERED_DOCS: 2);

  //*-- initially, set the current document positions and passage counts
  if (currentDoc == null)
  { currentDoc = new int[numThreads];
    for (int i = 0; i < currentDoc.length; i++) currentDoc[i] = -1;

  //*-- Start threads to process documents in the task file, stagger the start of threads

  for (int i = 0; i < numThreads; i++)
   try { crawlThread[i] = new CrawlThread (i, crawlConfig, this); }
   catch (IOException ie) { cleanUp("Could not create independent threads " + ie.getMessage() ); }   
   try { Thread.sleep(100); } catch (InterruptedException e) { }

} //*-- end of startThreads

  * Increment the log to track the number and type of files processed
  * @param docType
public synchronized void updateLogdata (String docType)
  if (docType == null) docType = "unknown";
  Integer count = Integer.parseInt(stats.get(docType) );
  if ( count == null) { docType = "unknown"; count = Integer.parseInt(stats.get(docType)); }
  count =  count + 1;
  stats.put(docType, count.toString());
} //*-- end of update log

  * Dump the log to a string
  * @return String containing the statistics for the crawl
public String dumpLogdata(int numThreads)
  StringBuffer logData = new StringBuffer(); int numFiles = 0;
  Integer io; String newLine = Constants.NEWLINE;
  logData.append("----------------------------------------------" + newLine);
  for (int i = 0; i < docTypes.length; i++)
  { io =   Integer.parseInt( stats.get( docTypes[i] ) )
  logData.append("No. of " + docTypes[i] + " files: " + io.intValue() + newLine);
  numFiles += io.intValue();
  io = Integer.parseInt(stats.get("duplicates") );
  logData.append("No. of duplicates: " + io.intValue() + newLine);
  numFiles += io.intValue();
  io = Integer.parseInt(stats.get("indexed earlier") );
  logData.append("No. indexed earlier: " + io.intValue() + newLine);
  numFiles += io.intValue();

  logData.append("----------------------------------------------" + newLine);
  logData.append("Total no. of files: " + numFiles + newLine);
  return (logData.toString());
} //*-- end of dump log

  * Called at the end of shutdown
  * @param msg
public synchronized void cleanUp(String msg)
  //*-- clean up the Lucene index
  try { if (iw != null)
         {"Optimizing Lucene index...");
           iw.close(); } }
  catch (IOException ie) { logger.error("Could not optimize Lucene index " + ie.getMessage() ); }

  //*-- close the task file
  try { if (taskFile != null) taskFile.close(); }
  catch (IOException ie) {  }

  if (Constants.getDbt() != null) Constants.getDbt().closeEnv();
  //*-- dump the spell checker model
  if (sc != null)
  { try { sc.pruneTokens(5);
          BufferedOutputStream bufOut = new BufferedOutputStream( new FileOutputStream(Constants.SPELL_CHECK_MODEL));
          ObjectOutputStream objOut = new ObjectOutputStream(bufOut); sc.compileTo(objOut);
          objOut.close(); bufOut.close();
          bufOut = new BufferedOutputStream( new FileOutputStream(Constants.SPELL_TRAIN_MODEL));
          objOut = new ObjectOutputStream(bufOut); sc.dumpTo(objOut);
          objOut.close(); bufOut.close();       
    catch (IOException ie) { logger.error("IO Error: " + ie.getMessage()); }
  if (!msg.equals(""))
  { logger.error(msg); logger.error("ERROR: This thread was aborted")}

public void dumpTimers(int numThreads)
  //*-- dump the headers
  System.out.println("Profile of Crawl");
  System.out.println(""); System.out.print("Timer Type\t\t");

  for (int i = 0; i < numThreads; i++) System.out.print( StringTools.fillin("Thread " + i,  10, true, ' ', 2) );

  //*-- dump the timers
  for (int i = 0; i < timers.length; i++)
   System.out.print(StringTools.fillin(timers[i], 22, true, ' ', 22 - timers[i].length() ) );
   for (int j = 0; j < numThreads; j++)
   { String timeT = (String) timerHash.get(timers[i] + "_" + j);
   System.out.print( StringTools.fillin( timeT, 10, false, ' ', 10 - timeT.length() ) ); }

  System.out.println(""); System.out.print("Timer Type\t\t");

  System.out.print( StringTools.fillin("Main Thread"15, true, ' ', 2) );
  System.out.println(""); System.out.println("-----------------------------------------------------");

  //*-- dump the timers
  System.out.print(StringTools.fillin("initTime", 22, true, ' ', 22 - 8) );
  System.out.print(StringTools.fillin("fileReadTime", 22, true, ' ', 22 - 12) );

  *  Terminate all threads
  * @param crawlThread
public void endThreads (CrawlThread[] crawlThread)
  //*-- signal all threads to stop running
  for (int i = 0; i < crawlThread.length; i++) crawlThread[i].endThread();

  //*-- wait till all threads complete...
  LOOP: for (int i = 0; i < crawlThread.length; i++)
  { int numTimes = 0;
    while ( (crawlThread[i] != null) && crawlThread[i].isAlive() )
    {"Waiting for indexing thread to terminate...");   
      try { Thread.sleep(3000); } catch (InterruptedException ie) { }
      if (numTimes++ < 100) continue LOOP;
      crawlThread[i] = null;
    } //*-- end of while
  } //*-- end of for


  * Crawl Threads call this function to update the timers
  * @param threadNum Integer thread number
  * @param timerType String type of timer
  * @param timerval long time in msecs.
public synchronized void updateTimers(int threadNum, String timerType, long timerval)
String key = timerType + "_" + threadNum;
    timerHash.put(key, Long.toString(timerval) ); }

  * return the file name at the index position in the task file
  * @param index line position in the task file
  * @return String
  * @throws IOException
public String getFileName(int index) throws IOException
{ if (taskFile == null) return ("");[index]); String fileName = taskFile.readLine();
   return ( (fileName == null) ? "": fileName.trim() );

  * return the number of files processed so far
public int getTotalFilesProcessed()
  String[] keys = stats.keys(); int numFiles = 0;
  for (int i = 0; i < keys.length; i++)
   numFiles += Integer.parseInt( stats.get(keys[i]) );

//*-- Read the NGramProcessLM model from a file
private  NGramProcessLM readModel(String filename) throws ClassNotFoundException, IOException
   //*--- create object input stream from file
   BufferedInputStream bufIn = new BufferedInputStream(new FileInputStream(new File(filename)));
   ObjectInputStream objIn = new ObjectInputStream(bufIn);

   //*-- read the spell checker
   NGramProcessLM nLM = NGramProcessLM.readFrom(objIn);
   // close the resources and return result
   objIn.close(); bufIn.close();
/** @return the Lucene index writer held by this manager */
public IndexWriter getIw()
{
  return this.iw;
}

/** @param iw the Lucene index writer this manager should hold from now on */
public void setIw(IndexWriter iw)
{
  this.iw = iw;
}

/** @return the current value of the enoughResources flag */
public synchronized boolean isEnoughResources()
{
  return this.enoughResources;
}

/** @param enoughMemory new value for the enoughResources flag */
public synchronized void setEnoughResources(boolean enoughMemory)
{
  this.enoughResources = enoughMemory;
}

/** @return the number of entries in the list of task file positions */
public int totalNumFiles()
{
  return this.filePos.length;
}

/**
 * @param threadNum thread number, used as an index into the current document list
 * @return the current document position stored for that thread
 */
public int getCurrentDoc(int threadNum)
{
  return this.currentDoc[threadNum];
}

/**
 * @param threadNum thread number, used as an index into the current document list
 * @param docnum document position to store for that thread
 */
public void setCurrentDoc(int threadNum, int docnum)
{
  this.currentDoc[threadNum] = docnum;
}

public TrainSpellChecker getSc()
{ return sc; }
public void setSc(TrainSpellChecker sc)
{ = sc; }
long initTime = 0;
long fileReadTime = 0;

