// Package org.apache.lucene.search
//
// Source Code of org.apache.lucene.search.PwaScorer

package org.apache.lucene.search;

import java.io.IOException;
import java.util.*;

import org.apache.lucene.index.*;
import org.apache.lucene.search.caches.PwaDateCache;
import org.apache.lucene.search.caches.PwaIndexStats;
import org.apache.lucene.search.caches.PwaStopwords;
import org.apache.lucene.search.filters.PwaFilter;
import org.apache.lucene.search.filters.PwaDateClosestFilter;
import org.apache.lucene.search.filters.PwaDateRangeFilter;
import org.apache.lucene.search.filters.PwaFilterChain;
import org.apache.lucene.search.filters.PwaBlacklistFilter;
import org.apache.lucene.search.queries.PwaClosestQuery;
import org.apache.lucene.search.queries.PwaSortQuery;
import org.apache.lucene.search.features.PwaLinearRankingModel;
import org.apache.lucene.search.features.PwaScores;


/**
* Matches and scores documents according to the query
* @author Miguel Costa
*/
public class PwaScorer extends Scorer {
 
  private final static int MIN_TF_ANCHORS=4; // minimum number of terms that occur only in the anchor field when matching a document 
  private static enum ScoreType {NORMAL, FLAT, ONLY_ONE, DATE_SORTED, DATE_SORTED_REVERSE};
  private BooleanQuery query;
  private Searcher searcher;  
  private IndexReader reader;
  private PwaFilterChain chainFilter;
  private Vector<PwaFilter> filters; // filters to apply
  private PwaJoiner joiner;
  private PwaFunctionsWritable functions;
  private PwaIndexStats indexstats;
  private boolean empty;
  private ScoreType scoreType;
  private long queryTimestamp;
 
  /**
   * Constructor
   * @param query query
   * @param searchersearcher
   * @param reader reader
   * @param functions ranking functions
   * @throws IOException
   */
  public PwaScorer(BooleanQuery query, Searcher searcher, IndexReader reader, PwaFunctionsWritable functions) throws IOException {
    super(null);
      this.query=query;
      this.searcher=searcher;
      this.reader=reader;
      this.functions=functions;
      this.indexstats=PwaIndexStats.getInstance(reader);
          
      this.chainFilter=null;
      this.filters=new Vector<PwaFilter>();
      this.joiner=null;
      this.empty=false;
      this.queryTimestamp=System.currentTimeMillis();
      init();
 
 
  /**
   * Initialize scorer
   * @throws IOException
   */
  private void init() throws IOException {
   
    LinkedHashMap htableMergers=new LinkedHashMap()// to separate query terms by term text (e.g. lisbon:url,lisbon:content from portugal:url,portugal:content)
    LinkedHashMap htableMergersExclude=new LinkedHashMap()// to separate query terms by term text to exclude (e.g. -lisbon:url,-lisbon:content from -portugal:url,-portugal:content)   
    LinkedHashMap htableMergersExtra=new LinkedHashMap()// to separate query terms by extra term field (e.g. type:pdf,type:ps from site:www.fccn.pt)
    LinkedHashMap htableMergersExtraExclude=new LinkedHashMap()// to separate query terms by extra term fields to exclude
    LinkedHashMap htablePositions=new LinkedHashMap()// to separate query terms by term field (e.g. lisbon:url,portugal:url from lisbon:content,portugal:content)     
    PwaSearchable termAux=null;
    boolean exclude=false;
    String termText=null;
    String termField=null;   
    boolean isOnlyPhrasesForRank=true; // phrase search has always distance 0, so the rank value is the same for all
   
    // do not rank results for now
    scoreType=ScoreType.FLAT;
   
    // extract terms and separate terms per text and field
    List lclauses=query.clauses();   
    for (int i=0; i<lclauses.size(); i++) {
           
      BooleanClause clause=(BooleanClause)lclauses.get(i);     
      List lclausesInside=processClause(clause);     
           
      // separate terms per text and field
        for (int j=0; j<lclausesInside.size(); j++) {        
          BooleanClause clauseInside = (BooleanClause)lclausesInside.get(j);
                                     
          Vector terms=new Vector();
          extractTerms(clauseInside.getQuery(),terms);
          Term termsArray[]=new Term[terms.size()];
          terms.toArray(termsArray);
        termField=termsArray[0].field();
         
          if (clause.getOccur()==BooleanClause.Occur.MUST_NOT) {         
            exclude=true;
          }
          else {
            exclude=false;
          }
         
          termAux=null;
          // create PwaTerm or PwaPhrase
          if (clauseInside.getQuery() instanceof TermQuery) {
            termText=termsArray[0].text();       
           
            if (!PwaStopwords.getInstance(reader).contains(termField,termText)) { // if term is not a stopword then process it, otherwise ignore it           
              if (indexstats.isField(termField) && !exclude) { // it must be a term field and not a "NOT" term, which do not require positional data                                                             
                if (termField.equals(PwaIndexStats.ANCHOR_DEF)) { // BUG nutchwax 0000202
                  termAux=new PwaTermLimited(termsArray[0],reader,MIN_TF_ANCHORS);
                }
                else {
                  termAux=new PwaTerm(termsArray[0],reader)
                }                                                     
                isOnlyPhrasesForRank=false;                 
              }
              else {
                termAux=new PwaExtraTerm(termsArray[0],reader);                 
              }                           
            }
          }
          else  if (clauseInside.getQuery() instanceof PhraseQuery && indexstats.isField(termField)) { // for phrase it must be a term                     
            Vector<PwaTerm> vecAux=new Vector<PwaTerm>()// terms searched in phrase                                       
            Vector<Integer> vecOffsetTerms=new Vector<Integer>(); // offset of query terms (for phrase processing)
            int stopwordsAtBegin=0;
           
            for (int k=0;k<termsArray.length; k++) {   
              // remove first and last terms if they are stopwords
              if (!PwaStopwords.getInstance(reader).contains(termField,termsArray[k].text())) { // if term is not a stopword process it, else ignore it                
                if (termField.equals(PwaIndexStats.ANCHOR_DEF)) { // BUG nutchwax 0000202
                  vecAux.add(new PwaTermLimited(termsArray[k],reader,MIN_TF_ANCHORS));
                }
                else {
                  vecAux.add(new PwaTerm(termsArray[k],reader))
                }                 
                vecOffsetTerms.add(k-stopwordsAtBegin);
              }
              else {
                if (vecOffsetTerms.size()==0) { // to exclude stopwords at beginning
                  stopwordsAtBegin++;
                }
              }
            }
            if (!exclude) {
              termAux=new PwaPhrase(vecAux,vecOffsetTerms);
            }
            else {
              termAux=new PwaExtraPhrase(vecAux,vecOffsetTerms); // does not need to collectFeatures
            }                                                                 
            StringBuffer sbuf=new StringBuffer();
            for (int k=0; k<termsArray.length; k++) {
              if (k>0) {  
                sbuf.append(" ");
              }           
              sbuf.append(termsArray[k].text());                                     
            }
            termText=sbuf.toString();
          }
          else {           
            empty=true;
            return;
          }
                                
          if (termAux!=null) { // if it is not a stopword partition query per text and field
            if (indexstats.isField(termField)) { 
              if (!exclude) {                 
                addTerms2Map(htableMergers, termAux, termText); // by term text                                           
                addTerms2Map(htablePositions, termAux, termField); // by term field
              }       
              else {         
                addTerms2Map(htableMergersExclude, termAux, termText); // by term text                           
              }
            }           
            else { // other fields like DOCNUM, type, site                     
              if (!exclude) {
                addTerms2Map(htableMergersExtra, termAux, termField); // by term field               
              }
              else {
                addTerms2Map(htableMergersExtraExclude, termAux, termField); // by term field               
              }
            }
          }
        }
    }                

                         
    // check empty query - at least a term must must be valid
    if (htableMergers.size()==0 && htableMergersExtra.size()==0) {
      empty=true;
      return;
    }
   
    // preparing for matching documents
    prepareMatching(isOnlyPhrasesForRank, htableMergers, htableMergersExtra, htableMergersExclude, htableMergersExtraExclude, htablePositions);     
  }
 
 
  /**
   * Processes query clause
   * @param clause query clause
   * @return sub-clauses extracted
   * @throws IOException
   */
  private List processClause(BooleanClause clause) throws IOException {   
    List lclausesInside=new Vector();
    if (clause.getQuery() instanceof TermQuery) { // add term                       
      lclausesInside.add(clause);       
    }
    else if (clause.getQuery() instanceof PhraseQuery) { // add phrase
      lclausesInside.add(clause);               
    }
    else if (clause.getQuery() instanceof PwaClosestQuery) { // add filter - date closest
      PwaClosestQuery query = (PwaClosestQuery)clause.getQuery();
      filters.add(new PwaDateClosestFilter(reader, query.getText()));
      scoreType=ScoreType.ONLY_ONE; // only one result, so it does not need to rank        
    }
    else if (clause.getQuery() instanceof RangeQuery) { // add filter - date range
          RangeQuery query = (RangeQuery)clause.getQuery();
          filters.add(new PwaDateRangeFilter(reader, query.getLowerTerm().text(), query.getUpperTerm().text()));
    }     
    else if (clause.getQuery() instanceof PwaSortQuery) { // add filter - sort results by date
      PwaSortQuery query = (PwaSortQuery)clause.getQuery();
      if (query.getField().equals("date")) { // it does not need to rank by score, because results are sorted by date
        if (query.getReverse()) {
          scoreType=ScoreType.DATE_SORTED_REVERSE;
        }
        else {
          scoreType=ScoreType.DATE_SORTED;
        }
      }
    }           
    else { // BooleanQuery     
      lclausesInside=((BooleanQuery)clause.getQuery()).clauses();
    }   
   
    return lclausesInside;
  }
 
 
  /**
   * Prepares joiner, mergers and filters for matching documents
   * @param isOnlyPhrasesForRank indicates if query has only phrases or not
   * @param htableMergers
   * @param htableMergersExtradouble score=0;
   * @param htableMergersExclude
   * @param htableMergersExtraExclude
   * @param htablePositions
   * @throws IOException
   */
  private void prepareMatching(boolean isOnlyPhrasesForRank, LinkedHashMap htableMergers, LinkedHashMap htableMergersExtra, LinkedHashMap htableMergersExclude, LinkedHashMap htableMergersExtraExclude, LinkedHashMap htablePositions) throws IOException {
 
    // set mergers for terms
    Vector<PwaMerger> mergers=new Vector<PwaMerger>();
    addTerms2Merger(mergers, htableMergers, false)
   
    // set mergers for extra terms
    Vector<PwaMerger> mergersExtra=new Vector<PwaMerger>();
    addTerms2Merger(mergersExtra, htableMergersExtra, false);
 
    // set mergers for terms to exclude 
    Vector<PwaMerger> mergersExclude=new Vector<PwaMerger>();
    addTerms2Merger(mergersExclude, htableMergersExclude, true);   
 
    // set mergers for extra terms to exclude
    Vector<PwaMerger> mergersExtraExclude=new Vector<PwaMerger>();
    addTerms2Merger(mergersExtraExclude, htableMergersExtraExclude, true);       
 
    // join mergers and the exclude mergers after all others (this order must be followed)   
    Vector<PwaSearchable> joinAll=new Vector<PwaSearchable>();
    if (mergers.size()>0) {
      joinAll.addAll(mergers); // add mergers
      if (scoreType==ScoreType.FLAT) {
        scoreType=ScoreType.NORMAL;
      }
    }
    if (mergersExtra.size()>0) {
      joinAll.addAll(mergersExtra); // add mergers for extra terms
    }
    if (mergersExclude.size()>0) {
      joinAll.addAll(mergersExclude); // add mergers with terms to exclude
    }
    if (mergersExtraExclude.size()>0) {
      joinAll.addAll(mergersExtraExclude); // add mergers with extra terms for exclude
    }
    joiner=new PwaJoiner(joinAll);

    // set positions manager
    Vector<PwaPositionsManager> posmanagers=new Vector<PwaPositionsManager>();
    Vector<PwaSearchable> vecTermsAux=null;
    if (mergers.size()>0 && !isOnlyPhrasesForRank) {       
      for (int i=0;i<PwaIndexStats.FIELDS.length;i++) {       
        vecTermsAux=(Vector)htablePositions.get(PwaIndexStats.FIELDS[i]);                   
        Vector<PwaTerm> vecAllTerms = new Vector<PwaTerm>();
        for (int k=0;k<vecTermsAux.size();k++) {
          if (vecTermsAux.get(k) instanceof PwaTerm) {
            vecAllTerms.add((PwaTerm)vecTermsAux.get(k));
          }
          else { // PwaPhrase
            vecAllTerms.addAll(((PwaPhrase)vecTermsAux.get(k)).getTerms());
          }
        }
   
        posmanagers.add(new PwaPositionsManager(vecAllTerms));         
      }                
    }
    joiner.setPositionsManager(posmanagers);
   
    // set filter chain
    filters.add(new PwaBlacklistFilter(reader)); // add the blacklist filter
    chainFilter=new PwaFilterChain(filters,joiner);   // set chain filter
  }
 
  /**
   * Add terms to maps to separate terms
   * @param htableMergers
   * @param term term query term
   * @param termText text or field of term
   */
  private void addTerms2Map(LinkedHashMap htableMergers, PwaSearchable term, String termText) {
    Vector<PwaSearchable> vecTermsAux=(Vector)htableMergers.get(termText);
    if (vecTermsAux==null) {
      vecTermsAux=new Vector<PwaSearchable>();         
    }     
    vecTermsAux.add(term);
    htableMergers.put(termText, vecTermsAux);
  }
 
  /**
   * Add terms to mergers
   * @param mergers
   * @param htableMergers
   * @param exclude
   */
  private void addTerms2Merger(Vector<PwaMerger> mergers, LinkedHashMap htableMergers, boolean exclude) {
    Vector<PwaSearchable> vecTermsAux=null;
    Vector<PwaSearchable> vecTermsSearchableAux=null;   
    for (Iterator iter=htableMergers.values().iterator();iter.hasNext();) {
      vecTermsAux=(Vector)iter.next();     
      vecTermsSearchableAux=new Vector<PwaSearchable>();
      vecTermsSearchableAux.addAll(vecTermsAux);
      mergers.add(new PwaMerger(vecTermsSearchableAux,exclude));
    }
  }
 
  /**
   * Extract terms
   * @param terms query terms
   * @note the extractTerms of Lucene uses a Set that eliminates duplicates. This is a problem for phrase queries. BUG nutchwax 0000583
   */
  private void extractTerms(Query query, Vector terms) {
    if (query instanceof TermQuery) {                        
      terms.add(((TermQuery)query).getTerm());   
    }
    else if (query instanceof PhraseQuery) {
      terms.addAll(Arrays.asList(((PhraseQuery)query).getTerms()));        
    }
    else if (query instanceof BooleanQuery) {
      List lclauses=((BooleanQuery)query).clauses();
      for (Iterator i = lclauses.iterator(); i.hasNext();) {
        BooleanClause clause = (BooleanClause) i.next();
        extractTerms(clause.getQuery(), terms);
     
    }   
  }

 
  /**
   * Scores and collects all matching documents
   * @param hc the collector to which all matching documents are passed through
   * {@link HitCollector#collect(int, float)}.
   * <br>When this method is used the {@link #explain(int)} method should not be used.
   */
  public void score(HitCollector hc) throws IOException
    while (next()) {
      hc.collect(doc(), score());         
    }               
  }   
   
  /**
   * Move to next document
   * @return true if has more documents; false otherwise
   */
  public boolean next() throws IOException {
    return !empty && chainFilter.next();
 
   
  /**
   * Get document id
   * @return document id
   */
  public int doc() {
    return chainFilter.doc();
  }     
     
  /**
   * Get document score
   */
  public float score() throws IOException {       
    if (scoreType==ScoreType.NORMAL) {     
      PwaRawFeatureCollector collector=new PwaRawFeatureCollector(reader);
      joiner.collectFeatures(doc(),collector);   
      PwaScores scores=PwaScorerFeatures.score(doc(),queryTimestamp,collector,joiner.getPositionsManager(),searcher,functions);
      return (new PwaLinearRankingModel()).score(functions, scores); // TODO parameterize the ranking model in the future
    }
    else if (scoreType==ScoreType.DATE_SORTED || scoreType==ScoreType.DATE_SORTED_REVERSE) { // results are sorted in TopDocCollector
      PwaDateCache sortCache=new PwaDateCache(reader);
      return sortCache.getTimestamp(doc());     
    }
    else { // flat ranking
      return 1;
    }
  }
   
  /**
   * Display ranking data
   * @param doc document id
   * @return
   * @throws IOException
   * @note a new PwaScorer should be created for each explain call. This method has lack of efficiency, it is just for debugging purposes.
   */
  public Explanation explain(int doc) throws IOException {   
    if (!joiner.skipToFromStart(doc)) {
      throw new IOException("Explain failed skipToFromStart:"+doc);
    }     
    if (doc!=doc()) {   //sanity check
      throw new IOException("Explain with different doc ids:"+doc+" "+doc());
   
                
    PwaRawFeatureCollector collector=new PwaRawFeatureCollector(reader);
    joiner.collectFeatures(doc(),collector);         
    return PwaScorerFeatures.explain(doc(),queryTimestamp,collector,joiner.getPositionsManager(),searcher,functions);            
  }
       
  /**
   * Skip to document @doc or superior
   * @return true if skip to document @targetDoc or superior; false otherwise
   */
  public boolean skipTo(int targetDoc) throws IOException {    
    throw new IOException("this method should not be called!");
  }
}
// TOP
//
// Related Classes of org.apache.lucene.search.PwaScorer
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.