
/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.cascade.retrieval;

import ivory.cascade.model.CascadeClique;
import ivory.core.RetrievalEnvironment;
import ivory.core.exception.ConfigurationException;
import ivory.core.exception.RetrievalException;
import ivory.smrf.model.Clique;
import ivory.smrf.model.DocumentNode;
import ivory.smrf.model.GlobalTermEvidence;
import ivory.smrf.model.GraphNode;
import ivory.smrf.model.MarkovRandomField;
import ivory.smrf.model.score.ScoringFunction;
import ivory.smrf.retrieval.Accumulator;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;

import org.apache.log4j.Logger;


import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
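 * Cascade retrieval evaluator: ranks documents with a multi-stage cascade model. The first
 * stage scores documents document-at-a-time over the postings and caches the term positions
 * of the retained candidates; subsequent stages re-score and prune those candidates using the
 * cached positions, accumulating per-document scores as well as the total cascade cost.
 *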
* @author Lidan Wang
*/
public class CascadeEval {
  private static final Logger LOG = Logger.getLogger(CascadeEval.class);

  static int INITIAL_STAGE_NUM_RESULTS = 20000;

  /**
   * Pool of accumulators.
   */
  private CascadeAccumulator[] mAccumulators = null;

  /**
   * Sorted list of accumulators.
   */
  private final PriorityQueue<CascadeAccumulator> mSortedAccumulators = new PriorityQueue<CascadeAccumulator>();

  /**
   * Comparator used to sort cliques by their max score.
   */
  private final Comparator<Clique> maxScoreComparator = new Clique.MaxScoreComparator();

  /**
   * Markov Random Field that we are using to generate the ranking.
   */
  private MarkovRandomField mMRF = null;

  /**
   * If defined, only documents within this set will be scored.
   */
  private int[] mDocSet = null;
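
  // Accumulated scores for the documents in mDocSet, aligned by index.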
  float[] accumulated_scores = null;

  // Declare these once so we don't have to repeatedly declare them in the methods
  double[] mDocSet_tmp;
  float[] accumulated_scores_tmp;
  int[] order;

  /**
   * MRF document nodes.
   */
  private List<DocumentNode> mDocNodes = null;

  /**
   * Maximum number of results to return.
   */
  private int mNumResults;

  // saved results from internalInputFile
  private float[][] mSavedResults;

  // K value used in cascade model
  private int mK;

  // Cost of this cascade model = # documents * sum of unit per document cost over the cliques
  float cascadeCost = 0;


  // Positional information for documents passed between cascade stages:
  // keptDocs[docIndex][queryTermIndex] holds the positions of that query term in the document;
  // keptDocLengths[docIndex] holds the document's length.
  int[][][] keptDocs;
  int[] keptDocLengths;

  // Single terms from the cliques used in the first stage, keyed by the concept: the values
  // record the clique number each term corresponds to, along with its collection frequency (cf)
  // and document frequency (df).
  Map<String, Integer> termToCliqueNumber = Maps.newHashMap();
  Map<String, Long> cf = Maps.newHashMap();
  Map<String, Integer> df = Maps.newHashMap();

  // for pruning use
  float meanScore = 0;
  float stddev = 0;

  int numQueryTerms;

  public static int defaultNumDocs = 9999999;

  public CascadeEval(MarkovRandomField mrf, int numResults, String qid, float[][] savedResults,
      int K) {
    this(mrf, null, numResults, qid, savedResults, K);
  }

  public CascadeEval(MarkovRandomField mrf, int[] docSet, int numResults, String qid,
      float[][] savedResults, int K) {
    mMRF = mrf;
    mDocSet = docSet;
    mNumResults = numResults;
    mDocNodes = getDocNodes();
    mSavedResults = savedResults;
    mK = K;

    // Lidan: get # query terms
    numQueryTerms = mMRF.getQueryTerms().length;

    keptDocs = new int[INITIAL_STAGE_NUM_RESULTS + 1][numQueryTerms][];

    keptDocLengths = new int[INITIAL_STAGE_NUM_RESULTS + 1];
  }

  // Lidan: assumes mDocSet[] & accumulated_scores[] are sorted in descending order of score!
  // Lidan: this method modifies mDocSet[] & accumulated_scores[] (class variables)
  public void pruneDocuments(String pruner, float pruner_param) {
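    // Supported pruners (documents are assumed to be sorted in descending score order):
    //   "score":    keep documents scoring at least (max - min) * pruner_param + min
    //   "mean-max": keep documents scoring at least pruner_param * max + (1 - pruner_param) * mean
    //   "rank":     keep the top (1 - pruner_param) fraction of the documents
    //   "z-score":  computes z-scores only (see the note in that branch below)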

    // After pruning, make sure we keep max(RetrievalEnvironment.mCascade_K, |retained docs|)
    // documents, capped at the number of available documents!

    int[] mDocSet_tmp = new int[mDocSet.length];
    float[] accumulated_scores_tmp = new float[accumulated_scores.length];

    int retainSize = 0;

    if (pruner.equals("score")) {
      float max_score = accumulated_scores[0];
      float min_score = accumulated_scores[accumulated_scores.length - 1];

      float score_threshold = (max_score - min_score) * pruner_param + min_score;

      for (int i = 0; i < accumulated_scores.length; i++) {
        if (score_threshold <= accumulated_scores[i]) {
          retainSize++;
        } else {
          break;
        }
      }
    } else if (pruner.equals("mean-max")) {
      float max_score = accumulated_scores[0];
      float mean_score = 0;
      for (int j = 0; j < accumulated_scores.length; j++) {
        mean_score += accumulated_scores[j];
      }
      mean_score = mean_score / (float) accumulated_scores.length;
      float score_threshold = pruner_param * max_score + (1.0f - pruner_param) * mean_score;

      for (int i = 0; i < accumulated_scores.length; i++) {
        if (score_threshold <= accumulated_scores[i]) {
          retainSize++;
        } else {
          break;
        }
      }
    } else if (pruner.equals("rank")) {
      // if pruner_param = 0.3 --> remove bottom 30% of the docs!
      retainSize = (int) ((1.0 - pruner_param) * ((double) (mDocSet.length)));
    } else if (pruner.equals("z-score")) {
      // compute mean
      float avgScores = 0.0f;

      for (int i = 0; i < accumulated_scores.length; i++) {
        avgScores += accumulated_scores[i];
      }
      avgScores = avgScores / (float) accumulated_scores.length;

      // compute variance
      float variance = 0.0f;
      for (int i = 0; i < accumulated_scores.length; i++) {
        variance += (accumulated_scores[i] - avgScores) * (accumulated_scores[i] - avgScores);
      }
      float stddev = (float) Math.sqrt(variance);

      float[] z_scores = new float[accumulated_scores.length];
      for (int i = 0; i < z_scores.length; i++) {
        z_scores[i] = (accumulated_scores[i] - avgScores) / stddev;
      }
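      // Note: the z-scores are computed but retainSize is not set by this pruner; it remains 0
      // here and is raised to the mK floor below.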
    } else {
      throw new RetrievalException("PruningFunction " + pruner + " is not supported!");
    }

    if (retainSize < mK) {
      if (mDocSet.length >= mK) {
        retainSize = mK;
      } else if (mK != defaultNumDocs) {
        // When training the model, the number of output documents is deliberately set large so
        // that the output size equals the retained-docs size.

        retainSize = mDocSet.length;
      }
    }

    if (retainSize > mDocSet.length) {
      retainSize = mDocSet.length;
    }

    for (int i = 0; i < retainSize; i++) {
      mDocSet_tmp[i] = mDocSet[i];
      accumulated_scores_tmp[i] = accumulated_scores[i];
    }
    mDocSet = new int[retainSize];
    accumulated_scores = new float[retainSize];

    for (int i = 0; i < retainSize; i++) {
      mDocSet[i] = mDocSet_tmp[i];
      accumulated_scores[i] = accumulated_scores_tmp[i];
    }

  }

  // Lidan: operate on class vars mDocSet[] & accumulated_scores
  public void sortDocumentsByDocnos() {
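    // Sort mDocSet[] by docno (via Quicksort on a double[] copy), carrying accumulated_scores[]
    // along through the order[] permutation so that each score stays aligned with its document.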
    order = new int[mDocSet.length];
    mDocSet_tmp = new double[mDocSet.length];
    accumulated_scores_tmp = new float[mDocSet.length];

    for (int i = 0; i < order.length; i++) {
      order[i] = i;
      mDocSet_tmp[i] = mDocSet[i];
      accumulated_scores_tmp[i] = accumulated_scores[i];
    }

    ivory.smrf.model.constrained.ConstraintModel.Quicksort(mDocSet_tmp, order, 0, order.length - 1);

    for (int i = 0; i < order.length; i++) {
      mDocSet[i] = (int) mDocSet_tmp[i];
      accumulated_scores[i] = accumulated_scores_tmp[order[i]];
    }
  }

  // Total cost of the cascade model: # documents * sum of unit per document cost over each clique
  public float getCascadeCost() {
    // Lidan: scale the cost to [0, 1); normalizedCost approaches 1 as the raw cost grows
    float normalizedCost = 1.0f - (float) (Math.exp(-0.01 * cascadeCost / 50000));
    return normalizedCost;
  }

  public Accumulator[] rank() {
    if (mSavedResults != null) {
      mDocSet = new int[mSavedResults.length];
      accumulated_scores = new float[mSavedResults.length];

      for (int i = 0; i < mSavedResults.length; i++) {
        mDocSet[i] = (int) mSavedResults[i][0];
        accumulated_scores[i] = mSavedResults[i][1];
      }

      keptDocs = new int[mDocSet.length + 1][numQueryTerms][];
      keptDocLengths = new int[mDocSet.length + 1];
    }

    // Initialize the MRF ==> this will clear out postings readers cache!
    try {
      mMRF.initialize();
    } catch (ConfigurationException e) {
      LOG.error("Error initializing MRF. Aborting ranking!");
      return null;
    }

    int totalCnt = mMRF.getCliques().size();
    Map<Integer, Set<CascadeClique>> cascadeStages = Maps.newHashMap();
    for (Clique c : mMRF.getCliques()) {
      CascadeClique cc = (CascadeClique) c;
      int stage = cc.getCascadeStage();
      if ( cascadeStages.containsKey(stage)) {
        cascadeStages.get(stage).add(cc);
      } else {
        cascadeStages.put(stage, Sets.newHashSet(cc));
      }
    }

    CascadeAccumulator[] results = null;
    // Cascade stage starts at 0
    int cascadeStage = 0;
    int cnt = 0;

    String pruningFunction = null;
    float pruningParameter = -1;
    int termMatches = 0;

    while (cnt != totalCnt) { // loop until all cascade stages have been processed
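      // Stage 0 does document-at-a-time scoring over the postings (executeInitialStage) and
      // caches each retained document's term positions; later stages re-score and prune those
      // retained documents using the positions cached in keptDocs.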
      float subTotal_cascadeCost = 0;

      if (cascadeStage < 1) { // only call once, then use keptDocs[][][]
        mMRF.removeAllCliques();

        for (CascadeClique c : cascadeStages.get(cascadeStage)) {
          mMRF.addClique(c);
          cnt++;

          pruningFunction = c.getPruningFunction();
          pruningParameter = c.getPruningParameter();

          int numDocs = Integer.MAX_VALUE;

          if (mDocSet == null) {
            numDocs = c.getNumberOfPostings();
            // the cost of the first stage is not ignored in the cost model
            subTotal_cascadeCost += c.cost * numDocs;
          } else {
            subTotal_cascadeCost += c.cost;
          }
        }

        if (mDocSet != null) {
          // Lidan: mDocSet[] & accumulated_scores[] should be sorted by doc scores!
          // Lidan: this method operates on mDocSet[] & accumulated_scores[]!
          pruneDocuments(pruningFunction, pruningParameter);

          // Lidan: will score all documents in the retained document set
          mNumResults = mDocSet.length;

          sortDocumentsByDocnos();

          // Cost = cost of applying the feature on the retained documents after pruning
          subTotal_cascadeCost = subTotal_cascadeCost * mNumResults;
        } else {
          // Lidan: first cascade stage, just output INITIAL_STAGE_NUM_RESULTS (20,000) documents
          mNumResults = INITIAL_STAGE_NUM_RESULTS;

          if (cascadeStage != 0) {
            throw new RetrievalException("Should be in the first cascade stage here, but stage = "
                + cascadeStage);
          }
        }

        // Create single pool of reusable accumulators.
        mAccumulators = new CascadeAccumulator[mNumResults + 1];
        for (int i = 0; i < mNumResults + 1; i++) {
          mAccumulators[i] = new CascadeAccumulator(0, 0.0f);
        }

        results = executeInitialStage();

        cascadeStage++;
      } else {
        String featureID = null;
        ScoringFunction scoringFunction = null;

        int mSize = -1;
        String[][] concepts_this_stage = new String[totalCnt][];
        float[] clique_wgts = new float[concepts_this_stage.length];

        int cntConcepts = 0;

        for (CascadeClique c : cascadeStages.get(cascadeStage)) {
          cnt++;
          pruningFunction = c.getPruningFunction();
          pruningParameter = c.getPruningParameter();

          featureID = c.getParamID().trim(); // termWt, orderedWt, unorderedWt
          scoringFunction = c.getScoringFunction();

          mSize = c.getWindowSize(); // window width
          if (mSize == -1 && !(featureID.equals("termWt"))) {
            throw new RetrievalException("Only term features don't support getWindowSize()! " + featureID);
          }
          concepts_this_stage[cntConcepts] = c.getSingleTerms();
          clique_wgts[cntConcepts] = c.getWeight();

          cntConcepts++;
          subTotal_cascadeCost += c.cost;
        }

        // for use in pruning

        // score-based
        float max_score = results[0].score;
        float min_score = results[results.length - 1].score;
        float score_threshold = (max_score - min_score) * pruningParameter + min_score;
        float mean_max_score_threshold = pruningParameter * max_score + (1.0f - pruningParameter) * meanScore;

        // rank-based
        int retainSize = (int) ((1.0 - pruningParameter) * ((double) (results.length)));
        int size = 0;

        // Clear priority queue.
        mSortedAccumulators.clear();

        float[] termCollectionFreqs = new float[cntConcepts];
        float[] termDFs = new float[cntConcepts];
        int[][] termIndexes = new int[cntConcepts][];

        float sumScore = 0;
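        // The loop below resolves, for each concept in this stage, the clique indexes of its
        // constituent terms (and, for term features, their cached cf/df); these indexes point
        // into the positional data saved in keptDocs[][] during the first stage.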

        for (int j = 0; j < cntConcepts; j++) {
          String[] singleTerms = concepts_this_stage[j];

          int termIndex1 = termToCliqueNumber.get(singleTerms[0]);

          if (featureID.indexOf("termWt") != -1) {
            float termCollectionFreq = cf.get(singleTerms[0]);
            termCollectionFreqs[j] = termCollectionFreq;

            float termDF = df.get(singleTerms[0]);
            termDFs[j] = termDF;

            termIndexes[j] = new int[1];
            termIndexes[j][0] = termIndex1;

            if (singleTerms.length != 1) {
              throw new RetrievalException("Term features should have exactly 1 term, but got "
                  + singleTerms.length);
            }
          } else {
            int termIndex2 = termToCliqueNumber.get(singleTerms[1]);

            termIndexes[j] = new int[2];
            termIndexes[j][0] = termIndex1;
            termIndexes[j][1] = termIndex2;

            if (singleTerms.length != 2) {
              throw new RetrievalException("Proximity features should have exactly 2 terms, but got "
                  + singleTerms.length);
            }
          }
        }

        // iterate over the result documents, which are sorted by score
        for (int i = 0; i < results.length; i++) {
          // apply pruning; if the document passes, score it and update the pruning stats for the
          // next cascade stage

          boolean passedPruning = false;
          if (pruningFunction.equals("rank")) {
            if (i < retainSize) {
              passedPruning = true;
            } else {
              if (size < mK && mK != defaultNumDocs) {
                passedPruning = true;
              } else {
                break;
              }
            }
          } else if (pruningFunction.equals("score")) {
            if (results[i].score > score_threshold) {
              passedPruning = true;
            } else {
              if (size < mK && mK != defaultNumDocs) {
                passedPruning = true;
              } else {
                break;
              }
            }
          } else if (pruningFunction.equals("mean-max")) {
            if (results[i].score > mean_max_score_threshold) {
              passedPruning = true;
            } else {
              if (size < mK && mK != defaultNumDocs) {
                passedPruning = true;
              } else {
                break;
              }
            }
          } else {
            throw new RetrievalException("Not supported pruner! "+pruningFunction);
          }

          if (passedPruning) {
            size++;

            int docIndex = results[i].index_into_keptDocs;
            int docLen = keptDocLengths[docIndex];
            float docScore_cascade = 0;

            for (int j = 0; j < cntConcepts; j++) {
              if (featureID.equals("termWt")) {
                int termIndex1 = termIndexes[j][0];
                int[] positions1 = keptDocs[docIndex][termIndex1];

                int tf = 0;
                if (positions1 != null) {
                  tf = positions1.length;
                }

                docScore_cascade += clique_wgts[j] * scoringFunction.getScore(tf, docLen);

              } else { // term proximity

                // merge into a single stream and compute matches. Assume there are only two
                // terms!!!

                int termIndex1 = termIndexes[j][0];
                int termIndex2 = termIndexes[j][1];

                int[] positions1 = keptDocs[docIndex][termIndex1];
                int[] positions2 = keptDocs[docIndex][termIndex2];

                int matches = 0;

                if (positions1 != null && positions2 != null) { // both query terms are in the doc

                  termMatches++;
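                  // Merge the two sorted position lists into a single sorted stream
                  // (newPositions), tagging each entry in newIds with the term it came from
                  // (0 for the first term, 1 for the second).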
                  int[] ids = new int[positions1.length];
                  Arrays.fill(ids, 0);
                  int length = positions1.length;

                  int length2 = positions2.length;

                  int[] newPositions = new int[length + length2];
                  int[] newIds = new int[length + length2];

                  int posA = 0;
                  int posB = 0;

                  int ii = 0;
                  while (ii < length + length2) {
                    if (posB == length2 || posA < length && positions1[posA] <= positions2[posB]) {
                      newPositions[ii] = positions1[posA];
                      newIds[ii] = ids[posA];
                      posA++;
                    } else {
                      newPositions[ii] = positions2[posB];
                      newIds[ii] = 1;
                      posB++;
                    }
                    ii++;
                  }

                  int[] positions = newPositions;
                  ids = newIds;

                  BitSet mMatchedIds = new BitSet(2); // Assume there are only two terms!!!

                  if (featureID.equals("orderedWt")) {

                    for (ii = 0; ii < positions.length; ii++) {
                      mMatchedIds.clear();
                      int maxGap = 0;
                      boolean ordered = true;
                      mMatchedIds.set(ids[ii]);
                      int matchedIDCounts = 1;
                      int lastMatchedID = ids[ii];
                      int lastMatchedPos = positions[ii];

                      for (int jj = ii + 1; jj < positions.length; jj++) {
                        int curID = ids[jj];
                        int curPos = positions[jj];
                        if (!mMatchedIds.get(curID)) {
                          mMatchedIds.set(curID);
                          matchedIDCounts++;
                          if (curID < lastMatchedID) {
                            ordered = false;
                          }
                          if (curPos - lastMatchedPos > maxGap) {
                            maxGap = curPos - lastMatchedPos;
                          }
                        }
                        // stop looking if the maximum gap is too large
                        // or the terms appear out of order
                        if (maxGap > mSize || !ordered) {
                          break;
                        }
                        // did we match all the terms, and in order?
                        if (matchedIDCounts == 2 && ordered) {
                          matches++;
                          break;
                        }
                      }
                    }
                  } else if (featureID.equals("unorderedWt")) {

                    for (ii = 0; ii < positions.length; ii++) {
                      mMatchedIds.clear();

                      mMatchedIds.set(ids[ii]);
                      int matchedIDCounts = 1;
                      int startPos = positions[ii];

                      for (int jj = ii + 1; jj < positions.length; jj++) {
                        int curID = ids[jj];
                        int curPos = positions[jj];
                        int windowSize = curPos - startPos + 1;

                        if (!mMatchedIds.get(curID)) {
                          mMatchedIds.set(curID);
                          matchedIDCounts++;
                        }
                        // stop looking if we've exceeded the maximum window size
                        if (windowSize > mSize) {
                          break;
                        }
                        // did we match all the terms?
                        if (matchedIDCounts == 2) {
                          matches++;
                          break;
                        }
                      }
                    }
                  } else {
                    throw new RetrievalException("Invalid featureID " + featureID);
                  }
                } // end if this is a match, i.e., both query terms are in the doc

//                float s = getScore(matches, docLen, RetrievalEnvironment.defaultCf,
//                    (float) RetrievalEnvironment.defaultDf, scoringFunctionName);
//                docScore_cascade += clique_wgts[j] * s;
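                // Proximity matches are scored with the collection-default cf/df statistics:
                // override the scoring function's global term evidence before computing the score.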
               
                GlobalTermEvidence termEvidence = scoringFunction.getGlobalTermEvidence();
                termEvidence.cf = RetrievalEnvironment.defaultCf;
                termEvidence.df = RetrievalEnvironment.defaultDf;

                scoringFunction.initialize(termEvidence, scoringFunction.getGlobalEvidence());
                docScore_cascade += clique_wgts[j] * scoringFunction.getScore(matches, docLen);

              } // end else it's proximity feature
            } // end for (each concept)

            // accumulate doc score in results[i] across cascade stages
            results[i].score += docScore_cascade;

            mSortedAccumulators.add(results[i]);

            sumScore += results[i].score;

          } // end if passed pruning
        } // end iterating over docs

        // re-order results[] by the new scores: drain the priority queue into results_tmp
        if (size != mSortedAccumulators.size()) {
          throw new RetrievalException("They should be equal right here " + size + " "
              + mSortedAccumulators.size());
        }

        CascadeAccumulator[] results_tmp = new CascadeAccumulator[size];

        meanScore = sumScore / (float) size; // update stats for use in pruning in next cascade stage
        stddev = 0;

        for (int i = 0; i < results_tmp.length; i++) {
          results_tmp[results_tmp.length - 1 - i] = mSortedAccumulators.poll();

          stddev += (results_tmp[results_tmp.length - 1 - i].score - meanScore)
              * (results_tmp[results_tmp.length - 1 - i].score - meanScore);
        }
        results = results_tmp;

        stddev = (float) Math.sqrt(stddev);

        // Note: no new accumulator pool is created for later stages; the accumulators from the
        // previous stage are reused, since we don't know how many documents are kept until
        // we're done iterating through them.

        cascadeStage++;

        subTotal_cascadeCost = subTotal_cascadeCost * size;

      } // end if not first stage

      cascadeCost += subTotal_cascadeCost;

    } // end while

    CascadeAccumulator[] results_return = results;

    if (results.length > mK) {
      results_return = new CascadeAccumulator[mK];

      for (int i = 0; i < mK; i++) {
        results_return[i] = new CascadeAccumulator(results[i].docno, results[i].score);
      }
    }

    return results_return;
  }

  public CascadeAccumulator[] executeInitialStage() {
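    // First-stage retrieval: document-at-a-time scoring over the term cliques with max-score
    // early termination. The top mNumResults documents are kept in a bounded priority queue,
    // and each kept document's term positions and length are cached in keptDocs/keptDocLengths
    // for re-scoring in later cascade stages.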

    // points to the next unfilled position in the keptDocs array
    int indexCntKeptDocs = 0;

    // Clear priority queue.
    mSortedAccumulators.clear();

    // Cliques associated with the MRF.
    List<Clique> cliques = mMRF.getCliques();

    if (cliques.size() == 0) {
      throw new RetrievalException("Shouldn't have size 0!");
    }

    // Current accumulator.
    CascadeAccumulator a = mAccumulators[0];

    // Maximum possible score that this MRF can achieve.
    float mrfMaxScore = 0.0f;
    for (Clique c : cliques) {
      if (!((CascadeClique) c).getParamID().equals("termWt")) {
        throw new RetrievalException(
            "In this faster cascade implementation, the first stage must consist of term features "
                + "in order to obtain positions[] values! Got: " + ((CascadeClique) c).getParamID());
      }
      mrfMaxScore += c.getMaxScore();
    }

    // Sort cliques according to their max scores.
    Collections.sort(cliques, maxScoreComparator);

    // Score that must be achieved to enter result set.
    double scoreThreshold = Double.NEGATIVE_INFINITY;

    // Offset into document set we're currently at (if applicable).
    int docsetOffset = 0;

    int docno = 0;
    if (mDocSet != null) {
      docno = docsetOffset < mDocSet.length ? mDocSet[docsetOffset++] : Integer.MAX_VALUE;
    } else {
      docno = mMRF.getNextCandidate();
    }

    boolean firstTime = true;

    while (docno < Integer.MAX_VALUE) {
      for (DocumentNode documentNode : mDocNodes) {
        documentNode.setDocno(docno);
      }

      // Document-at-a-time scoring.
      float docMaxScore = mrfMaxScore;
      boolean skipped = false;

      float score = 0.0f;

      // Lidan: accumulate document scores across the cascade stages
//      if (mDocSet != null && cascadeStage != 0) {
//        score = accumulated_scores[docsetOffset - 1];
//      }

      // for each query term, its positions in the current document
      int[][] termPositions = new int[cliques.size()][];
      int doclen = -1;

      for (int i = 0; i < cliques.size(); i++) {
        // Current clique that we're scoring.
        CascadeClique c = (CascadeClique) cliques.get(i);

        if (firstTime) {
          termToCliqueNumber.put(c.getConcept().trim().toLowerCase(), i);
          cf.put(c.getConcept().trim().toLowerCase(), c.termCollectionCF());
          df.put(c.getConcept().trim().toLowerCase(), c.termCollectionDF());
        }
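        // Max-score optimization: if even a perfect score on the remaining cliques cannot lift
        // this document above the current entry threshold, stop scoring it.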

        if (score + docMaxScore <= scoreThreshold) {
          // Advance postings readers (but don't score).
          for (int j = i; j < cliques.size(); j++) {
            cliques.get(j).setNextCandidate(docno + 1);
          }
          skipped = true;

          break;
        }

        // Document independent cliques do not affect the ranking.
        if (!c.isDocDependent()) {
          continue;
        }

        // Update document score.
        float cliqueScore = c.getPotential();
        score += c.getWeight() * cliqueScore;

        // Update the max score for the rest of the cliques.
        docMaxScore -= c.getMaxScore();

        // save positional information needed for document evaluation in the next stage
        int[] p = c.getPositions();

        if (p != null) {
          termPositions[i] = Arrays.copyOf(p, p.length);
          doclen = c.getDocLen();
        }
      }

      firstTime = false;

      // Keep track of mNumResults best accumulators.
      if (!skipped && score > scoreThreshold) {
        a.docno = docno;
        a.score = score;
        a.index_into_keptDocs = indexCntKeptDocs;
        keptDocLengths[indexCntKeptDocs] = doclen;

        mSortedAccumulators.add(a);

        // save positional information for each query term in the document
        for (int j = 0; j < termPositions.length; j++) {

          if (termPositions[j] != null) {
            keptDocs[indexCntKeptDocs][j] = Arrays.copyOf(termPositions[j], termPositions[j].length);
          }
        }

        if (mSortedAccumulators.size() == mNumResults + 1) {
          a = mSortedAccumulators.poll(); // Re-use the accumulator of the removed document

          // Once the maximum number of documents is in the queue, each newly added document
          // ejects an old one; the slot freed by the ejected document is reused to store the new
          // document's positional info in keptDocs.

          indexCntKeptDocs = a.index_into_keptDocs;
          keptDocs[indexCntKeptDocs] = new int[numQueryTerms][];

          scoreThreshold = mSortedAccumulators.peek().score;

        } else {
          a = mAccumulators[mSortedAccumulators.size()]; // Next non-used accumulator in the
                                                         // accumulator pool
          indexCntKeptDocs++;
        }

      }

      if (mDocSet != null) {
        docno = docsetOffset < mDocSet.length ? mDocSet[docsetOffset++] : Integer.MAX_VALUE;
      } else {
        docno = mMRF.getNextCandidate();
      }
    }

    // Grab the accumulators off the priority queue, in (reverse) order.
    CascadeAccumulator[] results_tmp = new CascadeAccumulator[Math.min(mNumResults,
        mSortedAccumulators.size())];

    for (int i = 0; i < results_tmp.length; i++) {
      results_tmp[results_tmp.length - 1 - i] = mSortedAccumulators.poll();
      meanScore += results_tmp[results_tmp.length - 1 - i].score;
    }

    meanScore /= results_tmp.length;

    CascadeAccumulator[] results = results_tmp;

    return results;
  }

  /**
   * Returns the Markov Random Field associated with this ranker.
   */
  public MarkovRandomField getMRF() {
    return mMRF;
  }

  /**
   * Sets the number of results to return.
   */
  public void setNumResults(int numResults) {
    mNumResults = numResults;
  }

  private List<DocumentNode> getDocNodes() {
    ArrayList<DocumentNode> docNodes = new ArrayList<DocumentNode>();

    // Check which of the nodes are DocumentNodes.
    List<GraphNode> nodes = mMRF.getNodes();
    for (GraphNode node : nodes) {
      if (node.getType() == GraphNode.Type.DOCUMENT) {
        docNodes.add((DocumentNode) node);
      }
    }
    return docNodes;
  }
}