Source Code of net.sf.regain.search.results.SearchResultsImpl

/*
 * regain - A file search engine providing plenty of formats
 * Copyright (C) 2004-2010 Thomas Tesche, Til Schneider
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Contact: Thomas Tesche: www.thtesche.com, Til Schneider: info@murfman.de
 *
 * CVS information:
 *  $RCSfile$
 *   $Source$
 *     $Date: 2008-08-06 16:04:27 +0200 (Wed, 06 Aug 2008) $
 *   $Author: thtesche $
 * $Revision: 325 $
 */
package net.sf.regain.search.results;


import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.DataFormatException;
import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.search.IndexSearcherManager;
import net.sf.regain.search.SearchToolkit;
import net.sf.regain.search.access.SearchAccessController;
import net.sf.regain.search.config.IndexConfig;
import net.sf.regain.util.sharedtag.PageRequest;
import org.apache.commons.collections.Factory;
import org.apache.commons.collections.ListUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.Version;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;


/**
 * Holds the results of a search on a single or multiple indexes. The class
 * uses the new Lucene API (2.9 and later).
 *
 * @author Thomas Tesche: www.thtesche.com
 */
public class SearchResultsImpl implements SearchResults {


  /**
   * The searcher used to execute the query: the searcher of the single index,
   * or a MultiSearcher over all indexes if more than one index is searched.
   */
  private Searcher mIndexSearcher;
  /**
   * A reader spanning the readers of all searched indexes (also used when
   * there is only one index). Stays <code>null</code> if no index was given.
   */
  private MultiReader mMultiReader;
  /** The query text as it was read from the request. */
  private String mQueryText;
  /** The time in milliseconds the constructor needed to set up and run the search. */
  private int mSearchTime;
  /** The index name: the name of the index if only one is searched, 'multiindex' otherwise. */
  private String mIndexName;
  /** The hits of this search. Is <code>null</code> if no query was executed. */
  private ScoreDoc[] hitScoreDocs;
  /** The collector the hits were gathered with. */
  TopDocsCollector topDocsCollector;
  // Finds a mimetype:"..." expression inside the query text. Group 1 is the
  // whole expression including the field name and the quotes.
  // NOTE(review): '.*' is greedy, so if the query contains further double
  // quotes after the mimetype value, the match extends up to the last one.
  private static Pattern mimetypeFieldPattern = Pattern.compile("(mimetype:\".*\")");
  /**
   * The regular expression a URL must match in order to be opened in a new
   * window. Is <code>null</code> if the index config defines none.
   */
  private RE mOpenInNewWindowRegex;
  /** The boolean query used while searching and highlighting. */
  private BooleanQuery mQuery;
  /** The current analyzer. With several indexes it is the analyzer of the first one. */
  private Analyzer mAnalyzer;
  /** The current config. With several indexes it is the config of the first one. */
  private IndexConfig mIndexConfig;
  /**
   * Factory creating the entries of the lazy hit list. It returns an empty
   * document, which acts as a placeholder until the real document is loaded.
   */
  Factory factory = new Factory() {


    @Override
    public Object create() {
      return new Document();
    }
  };
  /**
   * Holds the (possibly transformed) hit documents by hit index. Entries are
   * created on demand by the factory.
   */
  private List lazyHitList = ListUtils.lazyList(new ArrayList(), factory);


  /**
   * Creates an instanz of SearchResults. This class can search over a single
   * or multiple indexes.
   *
   * @param indexConfig the array of index configs
   * @param queryText The query text to search for.
   * @param groupArr The groups the searching user has reading rights for.
   *        See {@link net.sf.regain.search.access.SearchAccessController}.
   *        Is <code>null</code>, if no access control should be used.
   *
   * @throws RegainException If searching failed.
   */
  public SearchResultsImpl(IndexConfig[] indexConfigs, PageRequest request)
          throws RegainException {


    long startTime = System.currentTimeMillis();
    ArrayList<String> groupsArr = new ArrayList<String>();
    IndexSearcherManager[] indexSearcherManagers = new IndexSearcherManager[indexConfigs.length];


    mQueryText = SearchToolkit.getSearchQuery(request);
    //System.out.println("Initial query: " + mQueryText);


    String mimeTypeFieldText = null;
    String queryText = null;


    if (mQueryText != null) {
      // Remove the mimetype field if the query contains it


      Matcher matcher = mimetypeFieldPattern.matcher(mQueryText);
      boolean found = matcher.find();
      if (found && matcher.groupCount() > 0) {
        // the first group is the mimetype field identifier
        mimeTypeFieldText = matcher.group(1);
        queryText = mQueryText.replace(mimeTypeFieldText, "");
        //System.out.println("Query after mimetype removing: " + queryText);


      } else {
        queryText = mQueryText;
      }
    }


    // If there is at least on index
    if (indexConfigs.length >= 1) {


      for (int i = 0; i < indexConfigs.length; i++) {
        // Get the groups the current user has reading rights for
        String[] groupArr = null;
        SearchAccessController accessController = indexConfigs[i].getSearchAccessController();
        if (accessController != null) {
          groupArr = accessController.getUserGroups(request);
          // Check the Group array
          RegainToolkit.checkGroupArray(accessController, groupArr);
          groupsArr.addAll(Arrays.asList(groupArr));
        }
        // build composed result: all groups over all indeces the user has the rights.
        // find the IndexSearcherManager for every index
        indexSearcherManagers[i] = IndexSearcherManager.getInstance(indexConfigs[i].getDirectory());


      }


      String[] allGroups = groupsArr.toArray(new String[0]);


      // Decide whether to use IndexSearcher (only one index) or MultiSearcher
      if (indexSearcherManagers.length == 1) {
        //System.out.println("SingleSearcher");
        mIndexSearcher = indexSearcherManagers[0].getIndexSearcher();
        mAnalyzer = indexSearcherManagers[0].getAnalyzer();
        mIndexName = indexConfigs[0].getName();
        IndexReader[] readerArray = {indexSearcherManagers[0].getIndexReader()};
        mMultiReader = new MultiReader(readerArray);


      } else {
        // Collect all IndexSearchers and instantiate a MultiSearcher
        IndexSearcher[] searchers = new IndexSearcher[indexConfigs.length];
        IndexReader[] readerArray = new IndexReader[indexConfigs.length];
        for (int j = 0; j < indexSearcherManagers.length; j++) {
          searchers[j] = indexSearcherManagers[j].getIndexSearcher();
          readerArray[j] = indexSearcherManagers[j].getIndexReader();
        }
        try {
          mIndexSearcher = new MultiSearcher(searchers);
          mMultiReader = new MultiReader(readerArray);
          // get the 'first' analyzer (in fact it is a random choice)
          // All indexes has to be build with the same analyzer
          mAnalyzer = indexSearcherManagers[0].getAnalyzer();
          mIndexName = "multiindex";


        } catch (IOException ex) {
          throw new RegainException("Couldn't instantiate MultiSearcher.", ex);
        }


      }


      mIndexConfig = indexConfigs[0];
      mQuery = null;
      if (queryText != null && queryText.length() > 0) {
        // start the creation of the lucene query object


        try {
          mQuery = new BooleanQuery();


          for (int k = 0; k < indexConfigs.length; k++) {


            String[] searchFieldArr = indexConfigs[k].getSearchFieldList();
            for (int i = 0; i < searchFieldArr.length; i++) {


              QueryParser parser = new QueryParser(Version.LUCENE_30, searchFieldArr[i], mAnalyzer);
              parser.setDefaultOperator(QueryParser.AND_OPERATOR);
              parser.setAllowLeadingWildcard(true);
              Query fieldQuery = parser.parse(queryText);


              // Add as OR
              mQuery.add(fieldQuery, Occur.SHOULD);
            }
            //System.out.println("Query: '" + queryText + "' -> '" + mQuery.toString() + "'");


          }
        } catch (ParseException exc) {
          throw new RegainException("Error while parsing search pattern '" + mQueryText +
                  "': " + exc.getMessage(), exc);
        }


        // Check whether access control is used
        if (allGroups != null && allGroups.length > 0) {
          // Create a query that matches any group
          BooleanQuery groupQuery = new BooleanQuery();
          for (int i = 0; i < allGroups.length; i++) {
            // Add as OR
            groupQuery.add(new TermQuery(new Term("groups", allGroups[i])), Occur.SHOULD);
          }


          // Create a main query that contains the group query and the search query
          // combined with AND
          BooleanQuery mainQuery = new BooleanQuery();
          mainQuery.add(mQuery, Occur.MUST);
          mainQuery.add(groupQuery, Occur.MUST);


          // Set the main query as query to use
          mQuery = mainQuery;
        }
      }


      // Add the mimetype field search
      if (mimeTypeFieldText != null) {
        BooleanQuery mimetypeFieldQuery = new BooleanQuery();
        mimetypeFieldQuery.add(new TermQuery(new Term("mimetype",
                mimeTypeFieldText.substring(9).replace("\"", ""))),
                Occur.SHOULD);
        BooleanQuery mainQuery = new BooleanQuery();
        if (mQuery != null) {
          mainQuery.add(mQuery, Occur.MUST);
        }
        mainQuery.add(mimetypeFieldQuery, Occur.MUST);


        // Set the main query as query to use
        mQuery = mainQuery;
      }


      if (mQuery != null) {
        //System.out.println("Query: '" + mQueryText + "' -> '" + mQuery.toString() + "'");


        try {
          SortingOption sortingOption = new SortingOption(request.getParameter("order"));
          Sort sort = new Sort(sortingOption.getSortField());
          //System.out.println(sortingOption.toString());


          topDocsCollector = TopFieldCollector.create(sort, 10000, true, true, true, false);


          mIndexSearcher.search(mQuery, topDocsCollector);
          hitScoreDocs = topDocsCollector.topDocs().scoreDocs;


        } catch (IOException exc) {
          throw new RegainException("Searching query failed", exc);
        }


      }


      String openInNewWindowRegex = indexConfigs[0].getOpenInNewWindowRegex();
      if (openInNewWindowRegex != null) {
        try {
          mOpenInNewWindowRegex = new RE(openInNewWindowRegex);
        } catch (RESyntaxException exc) {
          throw new RegainException("Syntax error in openInNewWindowRegex: '" + openInNewWindowRegex + "'", exc);
        }
      }


    } else {
      // no index given
    }


    mSearchTime = (int) (System.currentTimeMillis() - startTime);


  }


  /**
   * Gets the query text of the search.
   *
   * @return The query text.
   */
  public String getQueryText() {
    return mQueryText;
  }


  /**
   * Gets the number of hits the search had.
   *
   * @return the number of hits the search had.
   */
  @Override
  public int getHitCount() {
    if (hitScoreDocs == null) {
      return 0;
    }
    return hitScoreDocs.length;
  }


  /**
   * Gets the number of documents in the in index.
   *
   * @return the number of indexed documents.
   */
  @Override
  public int getDocumentCount() {
    return mMultiReader.numDocs() - mMultiReader.numDeletedDocs();
  }


  /**
   * Gets the document of one hit. For holding the transformed documents we use
   * a lazy list.
   *
   * @param index The index of the hit.
   * @return the document of one hit.
   *
   * @throws RegainException If getting the document failed.
   * @see Document
   */
  @Override
  public Document getHitDocument(int index) throws RegainException {


    try {
      Document currDoc = (Document) lazyHitList.get(index);
      // The document is empty, so it's created by the factory. Replace it with the real one
      // at this position
      if (currDoc.getFields().isEmpty()) {
        lazyHitList.set(index, mIndexSearcher.doc(hitScoreDocs[index].doc));
      }
    } catch (Exception ex) {
      throw new RegainException("Error while accessing index", ex);
    }
    return (Document) lazyHitList.get(index);


  }


  /**
   * Writes a changed document back to the list.
   * 
   * @param index
   * @param document
   * @throws RegainException
   */
  private void setHitDocument(int index, Document document) throws RegainException {
    lazyHitList.set(index, document);
  }


  /**
   * Gets the score of one hit.
   *
   * @param index The index of the hit.
   * @return the score of one hit.
   *
   * @throws RegainException If getting the score failed.
   * @see Hits#score(int)
   */
  @Override
  public float getHitScore(int index) throws RegainException {
    try {
      return hitScoreDocs[index].score;
    } catch (Exception exc) {
      throw new RegainException("Error while getting score of search hit #" + index, exc);
    }
  }


  /**
   * Gets the time the search took in milliseconds.
   *
   * @return The search time.
   */
  @Override
  public int getSearchTime() {
    return mSearchTime;
  }


  /**
   * Gets whether a hit should be opened in a new window.
   *
   * @param index The index of the hit.
   * @return Whether the hit should be opened in a new window.
   * @throws RegainException If getting the URL failed.
   */
  @Override
  public synchronized boolean getOpenHitInNewWindow(int index)
          throws RegainException {
    String url = getHitUrl(index);


    if (mOpenInNewWindowRegex == null) {
      return false;
    } else {
      if (mOpenInNewWindowRegex.match(url)) {
        return true;
      } else {
        return false;
      }
    }
  }


  /**
   * Gets whether the file-to-http-bridge should be used for a certain hit.
   * <p>
   * Mozilla browsers have a security mechanism that blocks loading file-URLs
   * from pages loaded via http. To be able to load files from the search
   * results, regain offers the file-to-http-bridge that provides all files that
   * are listed in the index via http.
   *
   * @param index The index of the hit.
   * @return Whether the file-to-http-bridge should be used.
   */
  @Override
  public boolean getUseFileToHttpBridgeForHit(int index) {
    return mIndexConfig.getUseFileToHttpBridge();
  }


  /**
   * Gets the url from a hit and rewrites it according to the rewrite rules
   * specified in the index config.
   *
   * @param index The index of the hit to get the URL for.
   * @return The url of the wanted hit.
   * @throws RegainException If getting the hit document failed.
   */
  @Override
  public String getHitUrl(int index) throws RegainException {
    String url = getHitDocument(index).get("url");
    if (url == null) {
      return null;
    }


    // Get the rules
    String[][] rewriteRules = mIndexConfig.getRewriteRules();
    if (rewriteRules != null) {
      for (int i = 0; i < rewriteRules.length; i++) {
        String[] rule = rewriteRules[i];
        String prefix = rule[0];
        if (url.startsWith(prefix)) {
          String replacement = rule[1];
          return replacement + url.substring(prefix.length());
        }
      }
    }


    // The URL does not match any rewrite rule -> Don't change it
    return url;
  }


  /**
   * Gets the name of the index a hit comes from.
   *
   * @param index The index of the hit to get the index name for.
   * @return The name of the index a hit comes from.
   * @throws RegainException If getting the index name failed.
   */
  @Override
  public String getHitIndexName(int index) throws RegainException {
    return mIndexName;
  }


  /**
   * Shortens the summary.
   * 
   * @param index The index of the hit.
   * @throws RegainException if shorten fails.
   */
  @Override
  public void shortenSummary(int index) throws RegainException {


    try {
      Document document = getHitDocument(index);
      byte[] compressedFieldValue = document.getBinaryValue("summary");
      String text = null;
      if (compressedFieldValue != null) {
        text = CompressionTools.decompressString(compressedFieldValue);
      }


      if (text != null) {
        // Overwrite the content with a shortend summary
        String resSummary = RegainToolkit.createSummaryFromContent(text, 200);
        document.removeField("summary");
        if (resSummary != null) {
          document.add(new Field("summary", resSummary, Field.Store.NO, Field.Index.NOT_ANALYZED));
          document.add(new Field("summary", CompressionTools.compressString(resSummary), Field.Store.YES));
          // write back the transformed document
          setHitDocument(index, document);
        }
      }
    } catch (DataFormatException dataFormatException) {
      throw new RegainException("Error while searching pattern: " + mQueryText, dataFormatException);
    }


  }


  /**
   * Highlights fields in the document. Fields for highlighting will be:
   * - summary
   * - title
   *
   * @param index The index of the hit.
   * @throws RegainException If highlighting failed.
   */
  @Override
  public void highlightHitDocument(int index) throws RegainException {


    try {
      IndexSearcherManager manager = IndexSearcherManager.getInstance(mIndexConfig.getDirectory());
      // The highlighter needs a rewritten query to work with wildcard and fuzzy queries
      Query rewrittenQuery = manager.rewrite(mQuery);
      QueryScorer queryScorer = new QueryScorer(rewrittenQuery);
      // End added by Anders


      Highlighter highlighter = new Highlighter(
              new SimpleHTMLFormatter("<span class=\"highlight\">", "</span>"), queryScorer);


      // Remark: the summary is at this point not a summary. It contains the
      // first n characters from the document. n is configurable (default: 250000)
      // We transform this summary into
      // a) a summary matching the search terms (highlighting)
      // b) and a shortend summary (200 characters)
//      int docId = hitScoreDocs[index].doc;


      Document document = getHitDocument(index);
      byte[] compressedFieldValue = document.getBinaryValue("summary");
      String text = null;
      if (compressedFieldValue != null) {
        text = CompressionTools.decompressString(compressedFieldValue);
      }


      if (text != null) {
        // Overwrite the content with a shortend summary
        String resSummary = RegainToolkit.createSummaryFromContent(text, 200);
        document.removeField("summary");
        if (resSummary != null) {
          //System.out.println("resSummary " + resSummary);
          document.add(new Field("summary", resSummary, Field.Store.NO, Field.Index.NOT_ANALYZED));
          document.add(new Field("summary", CompressionTools.compressString(resSummary), Field.Store.YES));


        }


        String resHighlSummary = null;
        // Remove 'html', this works the same way as PageResponse.printNoHTML()
        text = RegainToolkit.replace(text, "<", "&lt;");
        text = RegainToolkit.replace(text, ">", "&gt;");


        TokenStream tokenStream = mAnalyzer.tokenStream("content",
                new StringReader(text));
        // Get 3 best fragments and seperate with a " ... "
        resHighlSummary = highlighter.getBestFragments(tokenStream, text, 3, " ... ");


        if (resHighlSummary != null) {
          //System.out.println("Highlighted summary: " + resHighlSummary);
          // write the result back to the document in a new field
          document.add(new Field("highlightedSummary", resHighlSummary, Field.Store.NO, Field.Index.NOT_ANALYZED));
          document.add(new Field("highlightedSummary", CompressionTools.compressString(resHighlSummary), Field.Store.YES));
        }
      }
      // Highlight the title
      text = document.get("title");
      String resHighlTitle = null;
      if (text != null) {
        TokenStream tokenStream = mAnalyzer.tokenStream("content",
                new StringReader(text));
        // Get the best fragment
        resHighlTitle = highlighter.getBestFragment(tokenStream, text);
      }


      if (resHighlTitle != null) {
        // write the result back to the document in a new field
        //System.out.println("Highlighted title: " + resHighlTitle);
        document.add(new Field("highlightedTitle", resHighlTitle,
                Field.Store.YES, Field.Index.NOT_ANALYZED));


      }
      // write back the transformed document
      setHitDocument(index, document);


      //System.out.println("The document: " + hitDocs.get(index).toString());


    } catch (org.apache.lucene.index.CorruptIndexException exCorr) {
      throw new RegainException("Error while searching pattern: " + mQueryText, exCorr);


    } catch (org.apache.lucene.search.highlight.InvalidTokenOffsetsException exToken) {
      throw new RegainException("Error while searching pattern: " + mQueryText, exToken);


    } catch (IOException exIO) {
      throw new RegainException("Error while searching pattern: " + mQueryText, exIO);


    } catch (DataFormatException ex) {
      throw new RegainException("Error while searching pattern: " + mQueryText, ex);
    }


  }


  /**
   * Gets whether the search terms should be highlighted
   *
   * @return whether to highlight
   * @throws RegainException If the value could not read from config
   */
  @Override
  public boolean getShouldHighlight(int index) throws RegainException {
    return mIndexConfig.getShouldHighlight();
  }
}
Source Code of net.sf.regain.search.results.SearchResultsImpl

Related Classes of net.sf.regain.search.results.SearchResultsImpl