/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.facet.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.CategoryListParams.OrdinalPolicy;
import org.apache.lucene.facet.params.FacetIndexingParams;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.params.PerDimensionIndexingParams;
import org.apache.lucene.facet.params.PerDimensionOrdinalPolicy;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.search.CountFacetRequest;
import org.apache.lucene.facet.search.DrillDownQuery;
import org.apache.lucene.facet.search.FacetRequest;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.FacetsCollector;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.junit.Test;
/** Tests facets index migration from payloads to DocValues. */
public class TestFacetsPayloadMigrationReader extends FacetTestCase {
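/**
 * A {@link FacetFields} implementation which writes the category list data
 * into term payloads rather than DocValues, emulating the payload-based
 * facets index format that this test migrates from.
 */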
private static class PayloadFacetFields extends FacetFields {
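/**
 * A {@link TokenStream} which emits one token per category list, with the
 * encoded ordinals attached as the token's payload.
 */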
private static final class CountingListStream extends TokenStream {
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final Iterator<Entry<String,BytesRef>> categoriesData;
CountingListStream(Map<String,BytesRef> categoriesData) {
this.categoriesData = categoriesData.entrySet().iterator();
}
@Override
public boolean incrementToken() throws IOException {
if (!categoriesData.hasNext()) {
return false;
}
Entry<String,BytesRef> entry = categoriesData.next();
termAtt.setEmpty().append(FacetsPayloadMigrationReader.PAYLOAD_TERM_TEXT + entry.getKey());
payloadAtt.setPayload(entry.getValue());
return true;
}
}
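// the field type used for the counting list data; payloads require positions to be indexed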
private static final FieldType COUNTING_LIST_PAYLOAD_TYPE = new FieldType();
static {
COUNTING_LIST_PAYLOAD_TYPE.setIndexed(true);
COUNTING_LIST_PAYLOAD_TYPE.setTokenized(true);
COUNTING_LIST_PAYLOAD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
COUNTING_LIST_PAYLOAD_TYPE.setStored(false);
COUNTING_LIST_PAYLOAD_TYPE.setOmitNorms(true);
COUNTING_LIST_PAYLOAD_TYPE.freeze();
}
public PayloadFacetFields(TaxonomyWriter taxonomyWriter, FacetIndexingParams params) {
super(taxonomyWriter, params);
}
@Override
protected FieldType drillDownFieldType() {
// Since the payload is indexed in the same field as the drill-down terms,
// we must set IndexOptions to DOCS_AND_FREQS_AND_POSITIONS
final FieldType type = new FieldType(TextField.TYPE_NOT_STORED);
type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
type.freeze();
return type;
}
@Override
protected void addCountingListData(Document doc, Map<String,BytesRef> categoriesData, String field) {
CountingListStream ts = new CountingListStream(categoriesData);
doc.add(new Field(field, ts, COUNTING_LIST_PAYLOAD_TYPE));
}
}
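/** The dimensions indexed by this test; doTestMigration maps two of them to custom category list fields. */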
private static final String[] DIMENSIONS = new String[] { "dim1", "dim2", "dim3.1", "dim3.2" };
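/**
 * Indexes random documents with payload-encoded facets and returns the
 * expected count per dimension (each document contributes at most 1 to each
 * dimension it mentions).
 */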
private HashMap<String,Integer> createIndex(Directory indexDir, Directory taxoDir, FacetIndexingParams fip)
throws Exception {
Random random = random();
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
conf.setMaxBufferedDocs(2); // flush often, so that the index ends up with several segments
conf.setMergePolicy(NoMergePolicy.COMPOUND_FILES); // avoid merges, so those segments are preserved
IndexWriter indexWriter = new IndexWriter(indexDir, conf);
TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
FacetFields facetFields = new PayloadFacetFields(taxoWriter, fip);
HashMap<String,Integer> expectedCounts = new HashMap<String,Integer>(DIMENSIONS.length);
int numDocs = atLeast(10);
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
int numCategories = random.nextInt(3) + 1;
ArrayList<CategoryPath> categories = new ArrayList<CategoryPath>(numCategories);
HashSet<String> docDimensions = new HashSet<String>();
while (numCategories-- > 0) {
String dim = DIMENSIONS[random.nextInt(DIMENSIONS.length)];
// we should only increment the expected count by 1 per document
docDimensions.add(dim);
categories.add(new CategoryPath(dim, Integer.toString(i), Integer.toString(numCategories)));
}
facetFields.addFields(doc, categories);
doc.add(new StringField("docid", Integer.toString(i), Store.YES));
doc.add(new TextField("foo", "content" + i, Store.YES));
indexWriter.addDocument(doc);
// update expected count per dimension
for (String dim : docDimensions) {
Integer val = expectedCounts.get(dim);
if (val == null) {
expectedCounts.put(dim, Integer.valueOf(1));
} else {
expectedCounts.put(dim, Integer.valueOf(val.intValue() + 1));
}
}
if (random.nextDouble() < 0.2) { // add some documents that will be deleted
doc = new Document();
doc.add(new StringField("del", "key", Store.NO));
facetFields.addFields(doc, Collections.singletonList(new CategoryPath("dummy")));
indexWriter.addDocument(doc);
}
}
indexWriter.commit();
taxoWriter.commit();
// delete the docs that were marked for deletion. note that the 'dummy'
// category is not removed from the taxonomy, so we must account for it when
// we verify the migrated index.
indexWriter.deleteDocuments(new Term("del", "key"));
indexWriter.commit();
IOUtils.close(indexWriter, taxoWriter);
return expectedCounts;
}
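/**
 * Migrates the index in place: each segment is wrapped with a
 * {@link FacetsPayloadMigrationReader}, which exposes the payload data as
 * DocValues, and the wrapped segments are then re-added to an emptied index.
 */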
private void migrateIndex(Directory indexDir, FacetIndexingParams fip) throws Exception {
final Map<String,Term> fieldTerms = FacetsPayloadMigrationReader.buildFieldTermsMap(indexDir, fip);
DirectoryReader reader = DirectoryReader.open(indexDir);
List<AtomicReaderContext> leaves = reader.leaves();
int numReaders = leaves.size();
AtomicReader wrappedLeaves[] = new AtomicReader[numReaders];
for (int i = 0; i < numReaders; i++) {
wrappedLeaves[i] = new FacetsPayloadMigrationReader(leaves.get(i).reader(), fieldTerms);
}
IndexWriter writer = new IndexWriter(indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, null));
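// drop the payload-based segments, then re-add them in their migrated (DocValues) form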
writer.deleteAll();
try {
writer.addIndexes(new MultiReader(wrappedLeaves));
writer.commit();
} finally {
reader.close();
writer.close();
}
}
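/** Asserts that the migrated index is fully searchable and that its facets data is intact. */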
private void verifyMigratedIndex(Directory indexDir, Directory taxoDir, HashMap<String,Integer> expectedCounts,
FacetIndexingParams fip) throws Exception {
DirectoryReader indexReader = DirectoryReader.open(indexDir);
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
IndexSearcher searcher = newSearcher(indexReader);
assertFalse("index should not have deletions", indexReader.hasDeletions());
verifyNotFacetsData(indexReader, searcher);
verifyFacetedSearch(expectedCounts, fip, indexReader, taxoReader, searcher);
verifyDrillDown(expectedCounts, fip, indexReader, taxoReader, searcher);
verifyIndexOrdinals(indexReader, taxoReader, fip);
IOUtils.close(indexReader, taxoReader);
}
private void verifyNotFacetsData(DirectoryReader indexReader, IndexSearcher searcher) throws IOException {
// verify that the non-facets data was not damaged
TotalHitCountCollector total = new TotalHitCountCollector();
searcher.search(new PrefixQuery(new Term("foo", "content")), total);
assertEquals("invalid number of results for content query", total.getTotalHits(), indexReader.maxDoc());
int numDocIDs = 0;
for (AtomicReaderContext context : indexReader.leaves()) {
Terms docIDs = context.reader().terms("docid");
assertNotNull(docIDs);
TermsEnum te = docIDs.iterator(null);
while (te.next() != null) {
++numDocIDs;
}
}
assertEquals("invalid number of docid terms", indexReader.maxDoc(), numDocIDs);
}
private void verifyFacetedSearch(Map<String,Integer> expectedCounts, FacetIndexingParams fip,
DirectoryReader indexReader, TaxonomyReader taxoReader, IndexSearcher searcher) throws IOException {
// run faceted search and assert expected counts
ArrayList<FacetRequest> requests = new ArrayList<FacetRequest>(expectedCounts.size());
for (String dim : expectedCounts.keySet()) {
requests.add(new CountFacetRequest(new CategoryPath(dim), 5));
}
FacetSearchParams fsp = new FacetSearchParams(fip, requests);
FacetsCollector fc = FacetsCollector.create(fsp, indexReader, taxoReader);
MatchAllDocsQuery base = new MatchAllDocsQuery();
searcher.search(base, fc);
List<FacetResult> facetResults = fc.getFacetResults();
assertEquals(requests.size(), facetResults.size());
for (FacetResult res : facetResults) {
FacetResultNode node = res.getFacetResultNode();
String dim = node.label.components[0];
assertEquals("wrong count for " + dim, expectedCounts.get(dim).intValue(), (int) node.value);
}
}
private void verifyDrillDown(Map<String,Integer> expectedCounts, FacetIndexingParams fip, DirectoryReader indexReader,
TaxonomyReader taxoReader, IndexSearcher searcher) throws IOException {
// verify drill-down
for (String dim : expectedCounts.keySet()) {
CategoryPath drillDownCP = new CategoryPath(dim);
FacetSearchParams fsp = new FacetSearchParams(fip, new CountFacetRequest(drillDownCP, 10));
DrillDownQuery drillDown = new DrillDownQuery(fip, new MatchAllDocsQuery());
drillDown.add(drillDownCP);
TotalHitCountCollector total = new TotalHitCountCollector();
FacetsCollector fc = FacetsCollector.create(fsp, indexReader, taxoReader);
searcher.search(drillDown, MultiCollector.wrap(fc, total));
assertTrue("no results for drill-down query " + drillDown, total.getTotalHits() > 0);
List<FacetResult> facetResults = fc.getFacetResults();
assertEquals(1, facetResults.size());
FacetResultNode rootNode = facetResults.get(0).getFacetResultNode();
assertEquals("wrong count for " + dim, expectedCounts.get(dim).intValue(), (int) rootNode.value);
}
}
private void verifyIndexOrdinals(DirectoryReader indexReader, TaxonomyReader taxoReader, FacetIndexingParams fip)
throws IOException {
// verify that the ordinals in the index match the ones in the taxonomy, and vice versa
// collect all fields which have DocValues, to assert later that all of them
// were visited, i.e. that the migration did not create DocValues fields that
// don't belong to any category list
HashSet<String> docValuesFields = new HashSet<String>();
for (AtomicReaderContext context : indexReader.leaves()) {
FieldInfos infos = context.reader().getFieldInfos();
for (FieldInfo info : infos) {
if (info.hasDocValues()) {
docValuesFields.add(info.name);
}
}
}
// check that all visited ordinals are found in the taxonomy and vice versa
boolean[] foundOrdinals = new boolean[taxoReader.getSize()];
for (int i = 0; i < foundOrdinals.length; i++) {
foundOrdinals[i] = false; // init to be on the safe side
}
foundOrdinals[0] = true; // the ROOT ordinal isn't indexed
// mark 'dummy' category ordinal as seen
int dummyOrdinal = taxoReader.getOrdinal(new CategoryPath("dummy"));
if (dummyOrdinal > 0) {
foundOrdinals[dummyOrdinal] = true;
}
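// iterate over all partitions (with partitionSize=Integer.MAX_VALUE there is exactly one)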
int partitionSize = fip.getPartitionSize();
int numPartitions = (int) Math.ceil(taxoReader.getSize() / (double) partitionSize);
final IntsRef ordinals = new IntsRef(32);
for (String dim : DIMENSIONS) {
CategoryListParams clp = fip.getCategoryListParams(new CategoryPath(dim));
int partitionOffset = 0;
for (int partition = 0; partition < numPartitions; partition++, partitionOffset += partitionSize) {
final CategoryListIterator cli = clp.createCategoryListIterator(partition);
for (AtomicReaderContext context : indexReader.leaves()) {
if (cli.setNextReader(context)) { // not all fields may exist in all segments
// remove that field from the list of DocValues fields
docValuesFields.remove(clp.field + PartitionsUtils.partitionName(partition));
int maxDoc = context.reader().maxDoc();
for (int doc = 0; doc < maxDoc; doc++) {
cli.getOrdinals(doc, ordinals);
for (int j = 0; j < ordinals.length; j++) {
// verify that the ordinal is recognized by the taxonomy
int ordinal = ordinals.ints[j] + partitionOffset;
assertTrue("should not have received dummy ordinal (" + dummyOrdinal + ")", dummyOrdinal != ordinal);
assertNotNull("missing category for ordinal " + ordinal, taxoReader.getPath(ordinal));
foundOrdinals[ordinal] = true;
}
}
}
}
}
}
assertTrue("some fields which have docValues were not visited: " + docValuesFields, docValuesFields.isEmpty());
for (int i = 0; i < foundOrdinals.length; i++) {
assertTrue("ordinal " + i + " not visited", foundOrdinals[i]);
}
}
private void doTestMigration(final int partitionSize) throws Exception {
// create a facets index with PayloadFacetFields and check it after migration
Directory indexDir = newDirectory();
Directory taxoDir = newDirectory();
// set custom CLP fields for two dimensions and use the default ($facets) for the other two
HashMap<CategoryPath,CategoryListParams> params = new HashMap<CategoryPath,CategoryListParams>();
params.put(new CategoryPath(DIMENSIONS[0]), new CategoryListParams(DIMENSIONS[0]) {
@Override
public OrdinalPolicy getOrdinalPolicy(String dimension) {
return OrdinalPolicy.ALL_PARENTS;
}
});
params.put(new CategoryPath(DIMENSIONS[1]), new CategoryListParams(DIMENSIONS[1]) {
@Override
public OrdinalPolicy getOrdinalPolicy(String dimension) {
return OrdinalPolicy.ALL_PARENTS;
}
});
HashMap<String,OrdinalPolicy> policies = new HashMap<String,CategoryListParams.OrdinalPolicy>();
policies.put(DIMENSIONS[2], OrdinalPolicy.ALL_PARENTS);
policies.put(DIMENSIONS[3], OrdinalPolicy.ALL_PARENTS);
FacetIndexingParams fip = new PerDimensionIndexingParams(params, new PerDimensionOrdinalPolicy(policies)) {
@Override
public int getPartitionSize() {
return partitionSize;
}
};
HashMap<String,Integer> expectedCounts = createIndex(indexDir, taxoDir, fip);
migrateIndex(indexDir, fip);
verifyMigratedIndex(indexDir, taxoDir, expectedCounts, fip);
IOUtils.close(indexDir, taxoDir);
}
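/** Migration of an unpartitioned index (a single partition of size Integer.MAX_VALUE). */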
@Test
public void testMigration() throws Exception {
doTestMigration(Integer.MAX_VALUE);
}
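/** Migration of an index whose ordinals are spread across multiple small partitions. */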
@Test
public void testMigrationWithPartitions() throws Exception {
doTestMigration(2);
}
}