Package org.apache.lucene.facet.sampling

Source Code of org.apache.lucene.facet.sampling.TakmiSampleFixer

package org.apache.lucene.facet.sampling;

import java.io.IOException;

import org.apache.lucene.facet.old.ScoredDocIDs;
import org.apache.lucene.facet.old.ScoredDocIDsIterator;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.DrillDownQuery;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* Fix sampling results by correct results, by counting the intersection between
* two lists: a TermDocs (list of documents in a certain category) and a
* DocIdSetIterator (list of documents matching the query).
* <p>
* This fixer is suitable for scenarios which prioritize accuracy over
* performance.
* <p>
* <b>Note:</b> for statistically more accurate top-k selection, set
* {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at
* least 2, so that the top-k categories would have better chance of showing up
* in the sampled top-cK results (see {@link SamplingParams#getOversampleFactor}
*
* @lucene.experimental
*/
public class TakmiSampleFixer extends SampleFixer {
 
  private TaxonomyReader taxonomyReader;
  private IndexReader indexReader;
  private FacetSearchParams searchParams;
 
  public TakmiSampleFixer(IndexReader indexReader,
      TaxonomyReader taxonomyReader, FacetSearchParams searchParams) {
    this.indexReader = indexReader;
    this.taxonomyReader = taxonomyReader;
    this.searchParams = searchParams;
  }

  @Override
  public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException {
    recount(facetResNode, docIds);
  }
 
  /**
   * Internal utility: recount for a facet result node
   *
   * @param fresNode
   *          result node to be recounted
   * @param docIds
   *          full set of matching documents.
   * @throws IOException If there is a low-level I/O error.
   */
  private void recount(FacetResultNode fresNode, ScoredDocIDs docIds) throws IOException {
    // TODO (Facet): change from void to return the new, smaller docSet, and use
    // that for the children, as this will make their intersection ops faster.
    // can do this only when the new set is "sufficiently" smaller.
   
    /* We need the category's path name in order to do its recounting.
     * If it is missing, because the option to label only part of the
     * facet results was exercise, we need to calculate them anyway, so
     * in essence sampling with recounting spends some extra cycles for
     * labeling results for which labels are not required. */
    if (fresNode.label == null) {
      fresNode.label = taxonomyReader.getPath(fresNode.ordinal);
    }
    CategoryPath catPath = fresNode.label;

    Term drillDownTerm = DrillDownQuery.term(searchParams.indexingParams, catPath);
    // TODO (Facet): avoid Multi*?
    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
    int updatedCount = countIntersection(MultiFields.getTermDocsEnum(indexReader, liveDocs,
                                                                     drillDownTerm.field(), drillDownTerm.bytes(),
                                                                     0), docIds.iterator());
    fresNode.value = updatedCount;
  }

  /**
   * Count the size of the intersection between two lists: a TermDocs (list of
   * documents in a certain category) and a DocIdSetIterator (list of documents
   * matching a query).
   */
  private static int countIntersection(DocsEnum p1, ScoredDocIDsIterator p2)
      throws IOException {
    // The documentation of of both TermDocs and DocIdSetIterator claim
    // that we must do next() before doc(). So we do, and if one of the
    // lists is empty, obviously return 0;
    if (p1 == null || p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
      return 0;
    }
    if (!p2.next()) {
      return 0;
    }
   
    int d1 = p1.docID();
    int d2 = p2.getDocID();

    int count = 0;
    for (;;) {
      if (d1 == d2) {
        ++count;
        if (p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
          break; // end of list 1, nothing more in intersection
        }
        d1 = p1.docID();
        if (!advance(p2, d1)) {
          break; // end of list 2, nothing more in intersection
        }
        d2 = p2.getDocID();
      } else if (d1 < d2) {
        if (p1.advance(d2) == DocIdSetIterator.NO_MORE_DOCS) {
          break; // end of list 1, nothing more in intersection
        }
        d1 = p1.docID();
      } else /* d1>d2 */ {
        if (!advance(p2, d1)) {
          break; // end of list 2, nothing more in intersection
        }
        d2 = p2.getDocID();
      }
    }
    return count;
  }

  /**
   * utility: advance the iterator until finding (or exceeding) specific
   * document
   *
   * @param iterator
   *          iterator being advanced
   * @param targetDoc
   *          target of advancing
   * @return false if iterator exhausted, true otherwise.
   */
  private static boolean advance(ScoredDocIDsIterator iterator, int targetDoc) {
    while (iterator.next()) {
      if (iterator.getDocID() >= targetDoc) {
        return true; // target reached
      }
    }
    return false; // exhausted
  }

}
TOP

Related Classes of org.apache.lucene.facet.sampling.TakmiSampleFixer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.