Package org.sf.mustruweb

Source Code of org.sf.mustruweb.Search

package org.sf.mustruweb;

import java.beans.IntrospectionException;
import java.beans.Introspector;
import java.beans.PropertyDescriptor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import org.sf.mustru.crawl.CrawlConfig;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.search.SearchQuery;
import org.sf.mustru.search.SearchQuestion;
import org.sf.mustru.search.SearchTools;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.DbTools;
import org.sf.mustru.utils.StringTools;

import com.sleepycat.je.DatabaseEntry;

/**
* Accept search parameters from the client and generate a response.
* If a question, then use the question search function, otherwise
* run a standard search.
*
* doGet dispatches on the "type" request parameter: "ASK" requests are answered
* through SearchQuestion and forwarded to /WEB-INF/answers.jsp; all other requests
* produce a document list that is forwarded to /WEB-INF/documents.jsp.
*/

public final class Search extends HttpServlet
{
static final long serialVersionUID = 7225456138146845275L;
//*-- NOTE(review): q and nounPatterns hold per-request values in instance fields; a servlet
//*-- instance is normally shared by concurrent requests, so this looks unsafe under
//*-- concurrent load -- confirm the deployment model
private String q;        //*-- query string, read from the "q" request parameter in doGet
private SearchQuery sQuery;      //*-- Submit a query and get the list of hits
private SearchQuestion sQuestion;    //*-- Submit a question and get the list of hits
private Searcher is;        //*-- Index searcher, obtained from SearchTools.getSearcher in init
private DbTools dbt;        //*-- Berkeley DB handle, environment opened in init and closed in destroy
private Pattern[] nounPatterns;    //*-- noun patterns to match in response to a question, rebuilt by createAbean
private Logger logger;      //*-- Log4j logger, configured in init
Properties dprops = null;      //*-- document properties file: maps a document type to the name of its doc class
//*-- matches a trailing "_<digits>" suffix; createAbean strips it from the key to get the file name
private static Pattern trimPattern = Pattern.compile("(_\\d+?)$");

/**
  * Load some initial data, set the index/database directories
  */
public void init()
{
 
  PropertyConfigurator.configure (Constants.LOG4J_FILE);
  logger = Logger.getLogger(Search.class.getName());

  //*-- set the index and database directories
  CrawlConfig crawlConfig = new CrawlConfig(false);   //*-- initialize from the properties file
  Constants.setDBDIR( crawlConfig.getDbDir() ); Constants.setINDEXDIR( crawlConfig.getIndexDir() );

  //*-- Create the Berkeley DB environment for read only access
  dbt = new DbTools(); dbt.openEnv(Constants.getDBDIR(), true);
  Constants.setDbt(dbt);
 
  try { is = SearchTools.getSearcher(Constants.getINDEXDIR(), false); }
  catch (IOException ie) { logger.error("IO Error " + ie.getMessage() ); }

  sQuestion = new SearchQuestion(); sQuery = new SearchQuery();
  try { dprops = new Properties(); dprops.load(new FileInputStream(Constants.DOCTYPES_FILE)); }
  catch (IOException e)  { logger.error("Could not read " + Constants.DOCTYPES_FILE + " " + e.getMessage()); }
}

/**
  * Respond to a GET request from client
  *
  * @param request Received from client
  * @param response Response to send to the client
  *
  * @exception IOException for an I/O error
  * @exception ServletException for servlet error
  */
public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException
{
  //*-- extract the query / question string
  q = request.getParameter("q");

  //*-- redirect blank queries to initial page
  if (q.equals(""))
  { RequestDispatcher dispatcher = request.getRequestDispatcher("index.html");
    try { dispatcher.forward(request, response); }
    catch (IOException ie) { logger.error("IO Error: Could not forward to index.html: " + ie.getMessage()); }
    return;
  }

  //*-- Fetch the hit list depending on the type of request - query, question, query expansion, or a similar document
  Hits hits = null; String requestType = request.getParameter("type"  );
  try
  { if (requestType.equals("SEARCH")) hits = sQuery.getHits(q);
    else if (requestType.equals("ASK")) hits =  sQuestion.getHits(q);
    else if (requestType.equals("EXPAND")) hits = is.search( SearchTools.expandQuery(q) );
    else if (requestType.equals("SIMILAR")) hits = is.search( SearchTools.similarDocs(request.getParameter("dockey"), dbt));
  }
  catch (IOException ie) { logger.error("IO Error: Could not fetch hits" + ie.getMessage()); }
 
  //*-- create the common bean containing the number of hits and paging information
  CommonBean cbean = createCbean(request, hits); request.setAttribute("cbean", cbean);
 
  //*-- generate the document or answer bean for queries and questions respectively
  try
  if ( (requestType.equals("ASK")) && (sQuestion != null) ) 
       { AnswerBean abean = createAbean(hits, cbean);  request.setAttribute("abean", abean); }
     else  { DocBean dbean = createDbean(hits, cbean);  request.setAttribute("dbean", dbean);
  } //*-- end of try
  catch (IOException ie) { logger.error("IO Error: Could not create doc/answer bean: " + ie.getMessage()); }

  //*-- pass control to the appropriate JSP
  String address = (requestType.equals("ASK")) ? "/WEB-INF/answers.jsp": "/WEB-INF/documents.jsp";
  RequestDispatcher dispatcher = request.getRequestDispatcher(address);
  try { dispatcher.forward(request, response); }
  catch (IOException ie) { logger.error("IO Error: Could not forward request: " + ie.getMessage()); }
}

//*------------------------------------------------------------
//*- Return a common bean filled with values based on the hits object
//*------------------------------------------------------------
private CommonBean createCbean(HttpServletRequest request, Hits hits)
{
  CommonBean cbean = new CommonBean();
 
  //*-- build the fields for the common bean
  int hitsLen = (hits != null) ? hits.length(): 0;
  //cbean.setQ(q.replaceAll("\"", """));
  cbean.setQ(q);
  cbean.setNumHits(hitsLen);
  cbean.setAlt(sQuery.getAlt());
 
  //*-- set the page increment
  int pageInc = (request.getParameter("pageInc") == null) ? 10: Integer.parseInt(request.getParameter("pageInc"));
  cbean.setPageInc(pageInc);
 
  //*-- set the total no. of pages
  int numPages = hitsLen / pageInc; numPages += ( (hitsLen % pageInc) == 0) ? 0: 1;
  cbean.setNumPages(numPages);

  //*-- set the start page
  int startPage = (request.getParameter("startPage")== null) 0: Integer.parseInt(request.getParameter("startPage"));
  if (startPage > numPages) startPage = 0;
  cbean.setStartPage(startPage);
 
  //*-- set the current page
  int currentPage = (request.getParameter("currentPage") == null) ? 0: Integer.parseInt(request.getParameter("currentPage"));
  if (currentPage > numPages) currentPage = 0;
  cbean.setCurrentPage(currentPage);
 
  //*-- set start/end index for the hit list
  int start = currentPage * pageInc;
  cbean.setStart(start);
  int end = ( (start + pageInc) < hitsLen ) ? start + pageInc: hitsLen; cbean.setEnd(end);
 
  //*-- set the type of question / query
  cbean.setType(request.getParameter("type"));
  return cbean;
}
//*---------------------------------------------------------
//*-- Create an answer bean containing the list of answers
//*---------------------------------------------------------
private AnswerBean createAbean(Hits hits, CommonBean cbean) throws IOException
{
  AnswerBean abean = new AnswerBean();
  if (hits == null)  { abean.setAnswers(null); return (abean); }
  //*-- build the list of answers
  ArrayList<Answer> alist = new ArrayList<Answer>();
  Matcher matcher = null;
  dbt.openDB(Constants.EXT_FILES_DB, true, false); //*-- read only access

  //*-- allocate space for the digests of the passages
  String[] qnouns = sQuestion.getNouns();
  nounPatterns = new Pattern[qnouns.length];
  for (int i = 0; i < qnouns.length; i++)
    nounPatterns[i] = Pattern.compile(qnouns[i], Pattern.CASE_INSENSITIVE);
  ArrayList<String> digests = new ArrayList<String>();
  int numPageHits = 0; int inc = 0;
  LOOP: for (int i = cbean.getStart(); i < hits.length(); i+= inc)
  {
   //*-- use the key to fetch the matching database entry
   Document doc = hits.doc(i);
   String key = doc.get("key");
   DatabaseEntry data = new DatabaseEntry();            
   if (!dbt.fetch(key, data)) continue LOOP;

   //*-- extract the passage text
   IndexableDoc idoc = new IndexableDoc();
   idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);
   String line= idoc.getContents().toString();
   String passage[] = SearchTools.bestPassages(line, sQuestion.getNouns(), sQuestion.getVerbs(), sQuestion.getAdjectives(),
                     sQuestion.getBigrams(), sQuestion.getEntities());
  
   //*-- get the file name
   matcher = trimPattern.matcher(key); if (matcher != null) key = matcher.replaceFirst("");

   //*-- check for dups from the top two sentences
   inc = 0;
   for (int j = 0; j < 2; j++)
   {
    String digest = StringTools.shaDigest(passage[j]);
    if (digests.indexOf(passage[j]) != -1) continue LOOP;
    digests.add(digest);

    Answer answer = new Answer();
    answer.setFileName(key);
    answer.setPassage(hiliter(passage[j], sQuestion.getNouns()) );
    answer.setScore(hits.score(i));
    alist.add(answer);
    inc++;
   }
  
   if (++numPageHits >= cbean.getPageInc()) break LOOP; 
  } //*-- end of for

  //*-- set the answers array in the bean and return
  Answer[] answers = new Answer[alist.size()]; alist.toArray(answers);
  abean.setAnswers(answers);
  dbt.closeDB();

  return(abean);
}

//*---------------------------------------------------------
//*-- Create a document bean containing the list of document hits
//*---------------------------------------------------------
private DocBean createDbean(Hits hits, CommonBean cbean) throws IOException
{
  DocBean dbean = new DocBean();
  if (hits == null) { dbean.setDocuments(null); return(dbean); }
 
  DateFormat df = DateFormat.getDateInstance(DateFormat.MEDIUM, Constants.locale);
  NumberFormat nf = NumberFormat.getInstance(Constants.locale);
  nf.setMaximumFractionDigits(1); nf.setMinimumFractionDigits(1);

  //*-- build the list of documents
  ArrayList<Doc> dlist = new ArrayList<Doc>();
  int numPageHits = 0;
  LOOP: for (int i = cbean.getStart(); i < hits.length(); i++)
  {
   //*-- use the key to fetch the matching database entry
   Document doc = hits.doc(i);

   String key = doc.get("key"); String ftype = doc.get("type");
   DatabaseEntry data = new DatabaseEntry();

   String dbname = (ftype.equalsIgnoreCase("email")) ? Constants.EXT_MESSAGES_DB: Constants.EXT_FILES_DB;

   dbt.openDB(dbname, true, false); //*-- open the database for read only access            
   if (!dbt.fetch(key, data)) continue LOOP;
   dbt.closeDB();
  
   //*-- use the type of the document to create a doc instance of the specified type
   try
   {
    Doc document = new Doc();
    String docClass = dprops.getProperty(ftype);
    if ( (docClass == null) || (docClass.equals("")) ) docClass = "org.sf.mustru.docs.TextDoc";
    Class docType = Class.forName(docClass);
    IndexableDoc idoc =  (IndexableDoc) docType.newInstance();
    idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data)
  
    //*-- set the snippet of the document for the bean
    String contents = idoc.getContents().toString(); contents = StringTools.filterChars(contents);
    if (contents.length() > 5000) contents = contents.substring(0, 5000);
    idoc.setContents(new StringBuffer(snippet(contents)));
      
    //*-- handle the different types of documents     
    copyBean(idoc, document);
    document.setFormatFileLength(nf.format(document.getFileLength() / 1000.0) );
    document.setFormatMdate( df.format( new Date(document.getMdate()) ) );
    document.setScore(hits.score(i));
    dlist.add(document);
    if (++numPageHits >= cbean.getPageInc()) break LOOP; 
   }
   catch (ClassNotFoundException ce)
   { logger.error("Could not get doc class : " + ce.getMessage()); }
   catch (InstantiationException ie)
   { logger.error("Could not instantiate doc class: " + ie.getMessage() ); }
   catch (IllegalAccessException ae)
   { logger.error("Could not access class: " + ae.getMessage()); }

  } //*-- end of for

  //*-- set the answers array in the bean and return
  Doc[] docs = new Doc[dlist.size()]; dlist.toArray(docs);
  dbean.setDocuments(docs);

  return(dbean);
}

//*--------------------------------------------------
//*-- copy values from one bean to another bean
//*--------------------------------------------------
private void copyBean (Object source, Object target
  {  
   PropertyDescriptor[]  sourceProperties = null;
   PropertyDescriptor[]  targetProperties = null;
   try
    { sourceProperties = Introspector.getBeanInfo  (source.getClass()).getPropertyDescriptors();
      targetProperties = Introspector.getBeanInfo  (target.getClass()).getPropertyDescriptors();
   
   catch  (IntrospectionException ie )  {  logger.error("Introspection error " + ie.getMessage());
  
   try
   {
   Object[] value =  {null}
   for  ( int i = 0; i  <  sourceProperties.length; ++i
    { 
     String name = sourceProperties[i].getName();
     LOOP: for ( int j = 0; j  <  targetProperties.length; ++j
       if  ( targetProperties[j].getName().equals(name) ) 
        { 
         Method read = sourceProperties[i].getReadMethod();
         Method write = targetProperties[j].getWriteMethod();
         if  ( read == null || write == null break LOOP;
         value[0= read.invoke(source, (Object[]) null);
         write.invoke(target, value);
         break LOOP;
        //*-- end of inner for
    } //*-- end of outer for  
   }
   catch (IllegalAccessException ie) { logger.error("Illegal access error " + ie.getMessage()); }
   catch (InvocationTargetException ie) { logger.error("Invocation error: " + ie.getMessage()); }
  
   return;
  }
//*-------------------------------------------------
//*-  Return a string with query keywords highlighted
//*-------------------------------------------------
private String snippet(String text) throws IOException
{ return (snippet(text, null) ); }

private String snippet(String text, String[] stopWords) throws IOException
{
  //*-- first extract a list of tokens from the query string excluding stop words
  StandardAnalyzer sAnalyzer = (stopWords == null) ? new StandardAnalyzer(): new StandardAnalyzer(stopWords);
  Token[] tokens = extractTokens(sAnalyzer, q);

  //*-- next, build a new filtered query string from the list of tokens
  StringBuffer queryString = new StringBuffer();
  LOOP: for (int i = 0; i < tokens.length; i++)
  { if (tokens[i].termText().length() < 3) continue LOOP;
    queryString.append(tokens[i].termText());
    if (i < (tokens.length - 1) ) queryString.append(" OR ");
  }

  //*-- parse the query using the standard analyzer and highlight the text
  QueryParser qp = new QueryParser("contents", sAnalyzer );
  String result = "";
  try
  {
   if (queryString.length() > 0)
   { Query query = qp.parse(queryString.toString());
     QueryScorer qScorer = new QueryScorer(query);
     SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"hlight\">", "</span>");
     Highlighter highlighter = new Highlighter(formatter, qScorer);
     Fragmenter fragmenter = new SimpleFragmenter(80);        //*-- use fragments of 50 bytes each
     highlighter.setTextFragmenter(fragmenter);
     TokenStream tokenStream = sAnalyzer.tokenStream("contents", new StringReader(text));
     result = highlighter.getBestFragments(tokenStream, text, 3, "...<br>")//*-- collect upto three fragments
   }
  }
  catch (ParseException pe) { logger.error("Query parse error " + pe.getMessage() ); }

  //*-- if no tokens were extracted, then return the original string
  return ( (result.length() == 0) ? StringTools.fillin(text, 250, true, '.', 3): result);

}

//*-----------------------------------------------------------
//*- Highlight text using the passed nouns
//*-----------------------------------------------------------
private String hiliter(String passage, String[] nouns)
{
  //*-- match the nouns in a loop and highlight the nouns
  LOOP: for (int i = 0; i < nouns.length; i++)
   { if (nouns[i].length() < 3) continue LOOP;
     String replace = "<span class=\"hlight\">" + nouns[i] + "</span>";
     passage = nounPatterns[i].matcher(passage).replaceAll(replace);
   }
  return(passage);
}
//*-----------------------------------------------------------
//*- Use the passed analyzer to get a list of tokens from the text
//*-----------------------------------------------------------
private static Token[] extractTokens(Analyzer analyzer, String text) throws IOException
{
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
  ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
  while ( (token = stream.next()) != null) tokenList.add(token);
  Token[] tokens = new Token[tokenList.size()];
  for (int i = 0; i < tokens.length; i++) tokens[i] = tokenList.get(i);
  return (tokens);
}

/**
  * Release resources before ending the servlet: drop the question and query
  * search helpers, then close the Berkeley DB environment.
  */
public void destroy()
{
  sQuestion = null;
  sQuery = null;
  dbt.closeEnv();
}

}
TOP

Related Classes of org.sf.mustruweb.Search

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.