Package org.sf.mustru.crawl

Source Code of org.sf.mustru.crawl.CrawlManager

package org.sf.mustru.crawl;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.lang.RuntimeException;
import java.util.Date;
import java.util.Enumeration;
import java.util.ArrayList;
import java.util.Properties;

import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.docs.SecKeyDoc;
import org.sf.mustru.search.SearchSimilarity;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.StandardBgramAnalyzer;
import org.sf.mustru.utils.StandardBgramTokenizerFactory;
import org.sf.mustru.utils.StringTools;
import org.sf.mustru.utils.TrainSpellChecker;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.eclipse.core.internal.runtime.HashMapOfString;

import com.aliasi.lm.NGramProcessLM;
import com.aliasi.spell.FixedWeightEditDistance;
import com.aliasi.tokenizer.TokenizerFactory;

/**
 * Manage the crawl <br>
 * <ol>
 * <li> Start the crawl - open the index and databases <br>
 * <li> Read the task file from crawlTask and start the individual crawl threads <br>
 * <li> Wait for the crawl threads to end and dump statistics <br>
 * </ol>
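 * <p>
 * A minimal usage sketch. Hedged: the CrawlConfig setters shown are assumptions
 * inferred from the getters this class calls; they are not confirmed API.
 * <pre>
 *   CrawlConfig config = new CrawlConfig();
 *   config.setFreshCrawl(true);             // assumption: counterpart of isFreshCrawl()
 *   config.setNumThreads(4);                // assumption: counterpart of getNumThreads()
 *   CrawlThread[] threads = new CrawlThread[config.getNumThreads()];
 *
 *   CrawlManager manager = CrawlManager.getCrawlManager();
 *   manager.initCrawl(config);              // open the index, databases, and timers
 *   manager.startThreads(config, threads);  // read the task file and launch threads
 *   manager.endThreads(threads);            // wait for the crawl threads to finish
 *   System.out.println(manager.dumpLogdata(config.getNumThreads()));
 *   manager.cleanUp("");                    // close the index, databases, and models
 * </pre>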
*/
public class CrawlManager
{
static Logger logger = Logger.getLogger(CrawlManager.class.getName());
private static CrawlManager ctRef = null;

//*-- timers and statistics
private HashMapOfString stats = new HashMapOfString();     //*-- a hash to keep track of statistics by media type
private String[] timers = Constants.getTIMERS();           //*-- a list of timers
private HashMapOfString timerHash = new HashMapOfString(); //*-- hash to keep track of timers of various events
private String[] docTypes = null;        //*-- types of documents that will be processed

//*-- Lucene vars
private IndexWriter iw = null;      //*-- Filesystem based indexWriter object for Lucene
private FSDirectory fsd;      //*-- Filesystem directory to store the index
private Analyzer analyzer;      //*-- tokenizer for the search engine

//*-- Lingpipe vars
private TrainSpellChecker sc = null;     //*-- spell checker from Lingpipe
private final static int NGRAM_LENGTH = 5;             //*-- n-gram length for the spell-check language model
private final static double MATCH_WEIGHT = -0.0;       //*-- edit distance weights for the spell checker
private final static double DELETE_WEIGHT = -4.0;
private final static double INSERT_WEIGHT = -1.0;
private final static double SUBSTITUTE_WEIGHT = -2.0;
private final static double TRANSPOSE_WEIGHT = -2.0;
//*-- vars for the task file
private RandomAccessFile taskFile = null;  //*-- handle to read the task file
private long[] filePos = null;      //*-- list of file positions in task file
private int[] currentDoc = null;    //*-- current document positions in each thread
private boolean enoughResources;    //*-- flag to indicate a thread ran out of memory

/**
  * Create a single instance of this class
  */
private CrawlManager() {  }
public static CrawlManager getCrawlManager() throws RuntimeException
{ if (ctRef != null)
     { logger.warn("Cannot run two instances of CrawlManager");
       throw new RuntimeException("Cannot run two instances of CrawlManager"); }

   //*-- instantiate a new crawl
   ctRef = new CrawlManager();
   if (Constants.getDbt() == null)
   { DbTools dbt = new DbTools();
     dbt.openEnv(Constants.getDBDIR(), false); Constants.setDbt(dbt);
   }
  return(ctRef);
}

public static void resetCrawlManager() { ctRef = null; }

/**
  * Initialize the crawl. <br><br>
  *
  * 1. Create a Lucene IndexWriter <br>
  * 2. Create the Berkeley databases <br>
  * 3. Initialize a timer hash <br>
  *
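  * The media types are loaded from the docTypes properties file; only the property
  * keys matter here (a hypothetical sketch of the file, not the shipped copy):
  * <pre>
  *   text=
  *   pdf=
  *   html=
  * </pre>
  *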
  * @param crawlConfig  Crawl configuration object
  */
public void initCrawl(CrawlConfig crawlConfig)
{
  boolean freshCrawl = crawlConfig.isFreshCrawl();
  int numThreads = crawlConfig.getNumThreads();
  initTime -= new Date().getTime();   //*-- start the initialization timer

  Properties props = new Properties();
  try { props.load(new BufferedInputStream(new FileInputStream(Constants.DOCTYPES_FILE))); }
  catch ( IOException e) { ctRef.cleanUp("Could not open " + Constants.DOCTYPES_FILE + " " + e.getMessage()); }

  //*-- load the types of media from the docTypes properties file
  docTypes = new String[props.size()]; int i = 0; Integer integerZero = new Integer(0);
  for (Enumeration keys = props.propertyNames(); keys.hasMoreElements(); )
  { String key = (String) keys.nextElement(); stats.put(key, integerZero.toString() );
    docTypes[i++] = key; }
  stats.put("duplicates", integerZero.toString() );
  stats.put("indexed earlier", integerZero.toString() );

  //*-- create the single filesystem based Lucene IndexWriter
  //*-- FreshIndex value:  -1 = use the freshCrawl flag
  //*--                     0 = false
  //*--                     * = true
  boolean freshIndex = (crawlConfig.getFreshIndex() == -1) ? freshCrawl :
                       (crawlConfig.getFreshIndex() != 0);
  try 
  {
   fsd = FSDirectory.getDirectory(new File(Constants.getINDEXDIR()), freshIndex);
   analyzer = new StandardBgramAnalyzer(); iw = new IndexWriter(fsd, analyzer, freshIndex);
   iw.setSimilarity(new SearchSimilarity());
   ctRef.setIw(iw);
  }
  catch (IOException ie) { ctRef.cleanUp("Could not get IndexWriter " + ie.getMessage() ); }

  //*-- create the spell checker
  if (crawlConfig.isSpellCheck())
  { FixedWeightEditDistance fixedEdit = new FixedWeightEditDistance( MATCH_WEIGHT, DELETE_WEIGHT, INSERT_WEIGHT,
      SUBSTITUTE_WEIGHT, TRANSPOSE_WEIGHT);
    NGramProcessLM lm = new NGramProcessLM(NGRAM_LENGTH);
    TokenizerFactory tokenizerFactory = new StandardBgramTokenizerFactory(false);  //*-- do not extract entities
    try { if ( (new File(Constants.SPELL_TRAIN_MODEL).exists() ) && !crawlConfig.isFreshCrawl() )
             lm = readModel(Constants.SPELL_TRAIN_MODEL);
          sc = new TrainSpellChecker(lm, fixedEdit, tokenizerFactory); }
    catch (IOException ie) { logger.error("IO Error: Could not read spell train file " + ie.getMessage()); }
    catch (ClassNotFoundException ce) { logger.error("Class error: " + ce.getMessage()); }
  }
 
  //*-- Create the databases, if necessary
  DbTools dbt = Constants.getDbt();
  boolean createFlag = true; boolean dupFlag = false;
  if (freshCrawl)
  {
   //*-- initialize and create a new database for the list of extracted files and a secondary
   //*-- database for the file signature
   dbt.dropDB(Constants.EXT_FILES_DB);
   dbt.createDB(Constants.EXT_FILES_DB, createFlag, dupFlag);
   logger.info("Created py. database " + Constants.EXT_FILES_DB);
   dbt.dropSecDB(Constants.EXT_FILES_SECDB);
   SecKeyDoc skd = new SecKeyDoc( new IndexableDoc().getBdbBinding() );
   if (dbt.createSecDB(Constants.EXT_FILES_SECDB, false, skd))
    logger.info("Created sec. database " + Constants.EXT_FILES_SECDB);
   dbt.closeSecDB(); dbt.closeDB();

   //*-- create the messages database
   dbt.dropDB(Constants.EXT_MESSAGES_DB);
   dbt.createDB(Constants.EXT_MESSAGES_DB, createFlag, dupFlag);
   logger.info("Created py. database " + Constants.EXT_MESSAGES_DB);
   dbt.closeDB();

  }

  //*-- initialize the timer hash
  timers = Constants.getTIMERS();
  for (int j = 0; j < timers.length ; j++)
   for (int k = 0; k < numThreads; k++)
    timerHash.put(timers[j] + "_" + k, " ");

  initTime += new Date().getTime();   //*-- stop the initialization timer
}

/**
  *  Scan the contents of the task file. Each line contains the path to an indexable document
  *  on the filesystem. Use the thread number and number of threads to distribute the load evenly
  *  across all threads. The text for each file is extracted, classified, and indexed.
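  *  For example, a task file with hypothetical paths (one document per line):
  *  <pre>
  *    /home/user/docs/report.pdf
  *    /home/user/docs/notes.txt
  *  </pre>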
  *
  * @param crawlConfig Crawl configuration object
  * @param crawlThread List of index doc threads
  */
public void startThreads(CrawlConfig crawlConfig, CrawlThread[] crawlThread)
{
  //*-- read the list of files to process into an array
  int numThreads = crawlConfig.getNumThreads();
  ArrayList<Long> fileList = new ArrayList<Long>();
  try
  { taskFile = new RandomAccessFile(Constants.TASK_FILE, "r"); taskFile.seek(0);
    int i = 0; fileList.add(i, new Long( taskFile.getFilePointer() ) );
    String fileName = "";
    LOOP: while ((fileName = taskFile.readLine()) != null)
    { if ( (crawlConfig.getStartPosition() <= 0) && (!(new File(fileName).canRead())) )
        continue LOOP;  //*-- make sure that the file is readable before adding to the list
      i++;
      fileList.add(i, new Long( taskFile.getFilePointer() ) );
    }
  }
  catch (FileNotFoundException fe)
  { cleanUp("Could not find task file " + Constants.TASK_FILE + " " + fe.getMessage()); }
  catch (IOException ie)
  { cleanUp("Could not read task file " + Constants.TASK_FILE + " " + ie.getMessage()); }

  //*-- set the file positions for each file in the task file
  filePos = new long[fileList.size() - 1];
  for (int i = 0; i < filePos.length; i++) filePos[i] = ( (Long) fileList.get(i)).longValue();
  fileList = null;

  //*-- set the Lucene parameter for buffering documents
  iw.setMaxBufferedDocs( (totalNumFiles() > 9) ? Constants.LUCENE_MAX_BUFFERED_DOCS: 2);

  //*-- initially, set the current document positions and passage counts
  if (currentDoc == null)
  { currentDoc = new int[numThreads];
    for (int i = 0; i < currentDoc.length; i++) currentDoc[i] = -1;
  }

  //*-- Start threads to process documents in the task file, stagger the start of threads

  for (int i = 0; i < numThreads; i++)
  { 
   try { crawlThread[i] = new CrawlThread (i, crawlConfig, this); }
   catch (IOException ie) { cleanUp("Could not create independent threads " + ie.getMessage() ); }   
   crawlThread[i].start();
   try { Thread.sleep(100); } catch (InterruptedException e) { }
  }

  return;
} //*-- end of startThreads

/**
  * Increment the log to track the number and type of files processed
  * @param docType
  */
public synchronized void updateLogdata (String docType)
{
  if (docType == null) docType = "unknown";
  String val = stats.get(docType);
  if (val == null) { docType = "unknown"; val = stats.get(docType); }
  int count = (val == null) ? 1 : Integer.parseInt(val) + 1;
  stats.put(docType, Integer.toString(count));
} //*-- end of update log

/**
  * Dump the log to a string
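  * Example of the returned string with hypothetical counts (the format follows the code below):
  * <pre>
  *   ----------------------------------------------
  *   No. of text files: 40
  *   No. of pdf files: 12
  *   No. of duplicates: 3
  *   No. indexed earlier: 7
  *   ----------------------------------------------
  *   Total no. of files: 62
  * </pre>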
  * @return String containing the statistics for the crawl
  */
public String dumpLogdata(int numThreads)
{
  StringBuffer logData = new StringBuffer(); int numFiles = 0;
  Integer io; String newLine = Constants.NEWLINE;
  logData.append("----------------------------------------------" + newLine);
  for (int i = 0; i < docTypes.length; i++)
  { io = Integer.parseInt( stats.get( docTypes[i] ) );
    logData.append("No. of " + docTypes[i] + " files: " + io.intValue() + newLine);
    numFiles += io.intValue();
  }
  io = Integer.parseInt(stats.get("duplicates") );
  logData.append("No. of duplicates: " + io.intValue() + newLine);
  numFiles += io.intValue();
  io = Integer.parseInt(stats.get("indexed earlier") );
  logData.append("No. indexed earlier: " + io.intValue() + newLine);
  numFiles += io.intValue();

  logData.append("----------------------------------------------" + newLine);
  logData.append("Total no. of files: " + numFiles + newLine);
  return (logData.toString());
} //*-- end of dump log


/**
  * Called at the end of shutdown
  * @param msg
  */
public synchronized void cleanUp(String msg)
{
  //*-- clean up the Lucene index
  try { if (iw != null)
         { logger.info("Closing Lucene index...");
           //iw.optimize();
           iw.close(); } }
  catch (IOException ie) { logger.error("Could not close Lucene index " + ie.getMessage() ); }

  //*-- close the task file
  try { if (taskFile != null) taskFile.close(); }
  catch (IOException ie) {  }

  if (Constants.getDbt() != null) Constants.getDbt().closeEnv();
 
  //*-- dump the spell checker model
  if (sc != null)
  { try { sc.pruneTokens(5);
          BufferedOutputStream bufOut = new BufferedOutputStream( new FileOutputStream(Constants.SPELL_CHECK_MODEL));
          ObjectOutputStream objOut = new ObjectOutputStream(bufOut); sc.compileTo(objOut);
          objOut.close(); bufOut.close();
         
          bufOut = new BufferedOutputStream( new FileOutputStream(Constants.SPELL_TRAIN_MODEL));
          objOut = new ObjectOutputStream(bufOut); sc.dumpTo(objOut);
          objOut.close(); bufOut.close();       
        }
    catch (IOException ie) { logger.error("IO Error: " + ie.getMessage()); }
  }
 
  if (!msg.equals(""))
  { logger.error(msg); logger.error("ERROR: This thread was aborted")}
  resetCrawlManager();
  return;
}

public void dumpTimers(int numThreads)
{
  //*-- dump the headers
  System.out.println("");
  System.out.println("Profile of Crawl");
  System.out.println(""); System.out.print("Timer Type\t\t");

  for (int i = 0; i < numThreads; i++) System.out.print( StringTools.fillin("Thread " + i,  10, true, ' ', 2) );
  System.out.println("");
  System.out.println("-------------------------------------------------------------------------------------------------------");

  //*-- dump the timers
  for (int i = 0; i < timers.length; i++)
  {
   System.out.print(StringTools.fillin(timers[i], 22, true, ' ', 22 - timers[i].length() ) );
   for (int j = 0; j < numThreads; j++)
   { String timeT = (String) timerHash.get(timers[i] + "_" + j);
     System.out.print( StringTools.fillin( timeT, 10, false, ' ', 10 - timeT.length() ) ); }
   System.out.println("");
  }

  System.out.println("");
  System.out.println(""); System.out.print("Timer Type\t\t");

  System.out.print( StringTools.fillin("Main Thread"15, true, ' ', 2) );
  System.out.println(""); System.out.println("-----------------------------------------------------");

  //*-- dump the timers
  System.out.print(StringTools.fillin("initTime", 22, true, ' ', 22 - 8) );
  System.out.println(initTime);
  System.out.print(StringTools.fillin("fileReadTime", 22, true, ' ', 22 - 12) );
  System.out.println(fileReadTime);  
}

/**
  *  Terminate all threads
  * @param crawlThread
  */
public void endThreads (CrawlThread[] crawlThread)
{
  //*-- signal all threads to stop running
  for (int i = 0; i < crawlThread.length; i++) crawlThread[i].endThread();

  //*-- wait till all threads complete...
  for (int i = 0; i < crawlThread.length; i++)
  { int numTimes = 0;
    while ( (crawlThread[i] != null) && crawlThread[i].isAlive() )
    { logger.info("Waiting for indexing thread to terminate...");
      try { Thread.sleep(3000); } catch (InterruptedException ie) { }
      if (numTimes++ >= 100) crawlThread[i] = null;  //*-- give up on the thread after ~5 minutes
    } //*-- end of while
  } //*-- end of for
}

/**
  * Crawl Threads call this function to update the timers
  * @param threadNum Integer thread number
  * @param timerType String type of timer
  * @param timerval long time in msecs.
  */
public synchronized void updateTimers(int threadNum, String timerType, long timerval)
{ String key = timerType + "_" + threadNum;
  timerHash.put(key, Long.toString(timerval) ); }

/**
  * return the file name at the index position in the task file
  * @param index line position in the task file
  * @return String
  * @throws IOException
  */
public String getFileName(int index) throws IOException
{ if (taskFile == null) return ("");
   taskFile.seek(filePos[index]); String fileName = taskFile.readLine();
   return ( (fileName == null) ? "": fileName.trim() );
}

/**
  * return the number of files processed so far
  */
public int getTotalFilesProcessed()
{
  String[] keys = stats.keys(); int numFiles = 0;
  for (int i = 0; i < keys.length; i++)
   numFiles += Integer.parseInt( stats.get(keys[i]) );
  return(numFiles);
}

//*-------------------------------------------------------------
//*-- Read the NGramProcessLM model from a file
//*-------------------------------------------------------------
private NGramProcessLM readModel(String filename) throws ClassNotFoundException, IOException
  {
   //*--- create object input stream from file
   BufferedInputStream bufIn = new BufferedInputStream(new FileInputStream(new File(filename)));
   ObjectInputStream objIn = new ObjectInputStream(bufIn);

   //*-- read the spell checker
   NGramProcessLM nLM = NGramProcessLM.readFrom(objIn);
  
   // close the resources and return result
   objIn.close(); bufIn.close();
   return(nLM);
}
public IndexWriter getIw()
{ return iw; }

public void setIw(IndexWriter iw)
{ this.iw = iw; }

public synchronized boolean isEnoughResources()
{ return enoughResources; }

public synchronized void setEnoughResources(boolean enoughMemory)
{ this.enoughResources = enoughMemory; }

public int totalNumFiles() { return filePos.length; }

public int getCurrentDoc(int threadNum)
{ return (currentDoc[threadNum]); }

public void setCurrentDoc(int threadNum, int docnum)
{ currentDoc[threadNum] = docnum; }

public TrainSpellChecker getSc()
{ return sc; }
public void setSc(TrainSpellChecker sc)
{ this.sc = sc; }
long initTime = 0;       //*-- main thread timers printed by dumpTimers
long fileReadTime = 0;
}
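
The cleanUp() method above persists two LingPipe models: a compiled spell-check model (compileTo) and a trainable model (dumpTo, re-read by readModel on the next incremental crawl). Below is a minimal sketch of loading the compiled model at query time. It assumes LingPipe's usual serialization contract, namely that the object written by TrainSpellChecker.compileTo() deserializes as com.aliasi.spell.CompiledSpellChecker; the SpellModelLoader class and the sample query are hypothetical, not part of this package.

package org.sf.mustru.crawl;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.ObjectInputStream;

import org.sf.mustru.utils.Constants;

import com.aliasi.spell.CompiledSpellChecker;

public class SpellModelLoader
{
  public static void main(String[] args) throws Exception
  {
    //*-- read back the compiled spell-check model written by CrawlManager.cleanUp()
    ObjectInputStream objIn = new ObjectInputStream(
      new BufferedInputStream(new FileInputStream(Constants.SPELL_CHECK_MODEL)));
    CompiledSpellChecker checker = (CompiledSpellChecker) objIn.readObject();
    objIn.close();

    //*-- suggest a corrected form of a (hypothetical) user query
    System.out.println(checker.didYouMean("lucene indx"));
  }
}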