Examples of org.apache.lucene.index.SortedSetDocValues
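
Before the excerpts, here is a minimal consumer-side sketch of the iteration contract that every example on this page implements or uses (setDocument, nextOrd, NO_MORE_ORDS, lookupOrd). It is not taken from any of the excerpts; the method name and the assumption of an already-open Lucene 4.x AtomicReader are illustrative only.

    // A minimal sketch (not from the excerpts below): dump every sorted-set
    // ordinal of one field. Assumed imports: java.io.IOException,
    // org.apache.lucene.index.AtomicReader, org.apache.lucene.index.SortedSetDocValues,
    // org.apache.lucene.util.BytesRef.
    static void dumpSortedSetValues(AtomicReader reader, String field) throws IOException {
      SortedSetDocValues dv = reader.getSortedSetDocValues(field);
      if (dv == null) {
        return; // field was not indexed with sorted-set doc values
      }
      BytesRef scratch = new BytesRef();
      for (int doc = 0; doc < reader.maxDoc(); doc++) {
        dv.setDocument(doc);                                  // position the iterator on this document
        long ord;
        while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
          dv.lookupOrd(ord, scratch);                         // resolve the ordinal back to its term bytes
          System.out.println(doc + " -> " + scratch.utf8ToString());
        }
      }
    }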


    final NumericDocValues docToOrdAddress = instance.docToOrdAddress;
    final NumericDocValues ords = instance.ords;
    final BinaryDocValues values = instance.values;

    // Must make a new instance since the iterator has state:
    return new SortedSetDocValues() {
      int ordUpto;
      int ordLimit;

      @Override
      public long nextOrd() {
        // ... (rest of the original source not shown)


    final Arc<Long> scratchArc = new Arc<Long>();
    final IntsRef scratchInts = new IntsRef();
    final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
    final BytesRef ref = new BytesRef();
    final ByteArrayDataInput input = new ByteArrayDataInput();
    return new SortedSetDocValues() {
      long currentOrd;

      @Override
      public long nextOrd() {
        if (input.eof()) {
          // ... (rest of the original source not shown)

    /**
     * Returns a DocIdSet with documents that should be permitted in search
     * results.
     */
    @Override
    public DocIdSet getDocIdSet(AtomicReaderContext context, final Bits acceptDocs) throws IOException {
      final SortedSetDocValues docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), query.field);
      // Cannot use FixedBitSet because we require long index (ord):
      final OpenBitSet termSet = new OpenBitSet(docTermOrds.getValueCount());
      TermsEnum termsEnum = query.getTermsEnum(new Terms() {
       
        @Override
        public Comparator<BytesRef> getComparator() {
          return BytesRef.getUTF8SortedAsUnicodeComparator();
        }
       
        @Override
        public TermsEnum iterator(TermsEnum reuse) {
          return docTermOrds.termsEnum();
        }

        @Override
        public long getSumTotalTermFreq() {
          return -1;
        }

        @Override
        public long getSumDocFreq() {
          return -1;
        }

        @Override
        public int getDocCount() {
          return -1;
        }

        @Override
        public long size() {
          return -1;
        }

        @Override
        public boolean hasFreqs() {
          return false;
        }

        @Override
        public boolean hasOffsets() {
          return false;
        }

        @Override
        public boolean hasPositions() {
          return false;
        }
       
        @Override
        public boolean hasPayloads() {
          return false;
        }
      });
     
      assert termsEnum != null;
      if (termsEnum.next() != null) {
        // fill into an OpenBitSet
        do {
          termSet.set(termsEnum.ord());
        } while (termsEnum.next() != null);
      } else {
        return null;
      }
     
      return new FieldCacheDocIdSet(context.reader().maxDoc(), acceptDocs) {
        @Override
        protected final boolean matchDoc(int doc) throws ArrayIndexOutOfBoundsException {
          docTermOrds.setDocument(doc);
          long ord;
          // TODO: we could track max bit set and early terminate (since they come in sorted order)
          while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
            if (termSet.get(ord)) {
              return true;
            }
          }
          return false;
        }
      };
    }
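
The enclosing class is not named in this excerpt, but the combination of query.getTermsEnum and FieldCache.DEFAULT.getDocTermOrds matches the doc-values-based MultiTermQuery rewrite of this Lucene generation. As a hedged sketch only (DocTermOrdsRewriteMethod, the field name and the range query below are assumptions, not taken from the excerpt), such a rewrite method is typically wired in like this:

    // Sketch: rewrite a multi-term query against uninverted doc term ords
    // instead of the terms index (assumed class: DocTermOrdsRewriteMethod).
    TermRangeQuery q = TermRangeQuery.newStringRange("category", "a", "m", true, true);
    q.setRewriteMethod(new DocTermOrdsRewriteMethod());
    TopDocs hits = searcher.search(q, 10);   // "searcher" is an assumed IndexSearcher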

    final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
    final LongValues ordinals = getNumeric(ords.get(field.number));
    // the values and the ord stream themselves stay on disk, but the addresses into the ord stream are kept in RAM:
    final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(data, field, ordIndexes.get(field.number));
   
    return new SortedSetDocValues() {
      long offset;
      long endOffset;
     
      @Override
      public long nextOrd() {
        // ... (rest of the original source not shown)

  // TODO: if a DocTermsIndex was already created, we
  // should share it...
  public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field) throws IOException {
    SortedSetDocValues dv = reader.getSortedSetDocValues(field);
    if (dv != null) {
      return dv;
    }
   
    SortedDocValues sdv = reader.getSortedDocValues(field);
    // ... (rest of the original source not shown)
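
As a usage note (a sketch, not one of this page's excerpts): the entry point above is the same one the filter example earlier on the page goes through, and a caller can reach it directly:

    // Sketch only: uninvert (or reuse) sorted-set ords for a field via the
    // FieldCache front door; "reader" is an assumed AtomicReader and
    // "category" an assumed field name.
    SortedSetDocValues ords = FieldCache.DEFAULT.getDocTermOrds(reader, "category");
    System.out.println("distinct values for category: " + ords.getValueCount());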

    final IndexInput in = data.clone();
    final BytesRef scratch = new BytesRef();
    final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
   
    return new SortedSetDocValues() {
      String[] currentOrds = new String[0];
      int currentIndex = 0;
     
      @Override
      public long nextOrd() {
        // ... (rest of the original source not shown)

      // Make sure the provided reader context matches the top-level reader
      // passed to the SortedSetDocValuesReaderState; otherwise a cryptic
      // ArrayIndexOutOfBoundsException (AIOOBE) can happen:
      if (ReaderUtil.getTopLevelContext(hits.context).reader() != origReader) {
        throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
      }
     
      SortedSetDocValues segValues = reader.getSortedSetDocValues(field);
      if (segValues == null) {
        continue;
      }

      DocIdSetIterator docs = hits.bits.iterator();

      // TODO: yet another option is to count all segs
      // first, only in seg-ord space, and then do a
      // merge-sort-PQ in the end to only "resolve to
      // global" those seg ords that can compete, if we know
      // we just want top K?  ie, this is the same algo
      // that'd be used for merging facets across shards
      // (distributed faceting).  but this has much higher
      // temp ram req'ts (sum of number of ords across all
      // segs)
      if (ordinalMap != null) {
        int segOrd = hits.context.ord;

        int numSegOrds = (int) segValues.getValueCount();

        if (hits.totalHits < numSegOrds/10) {
          //System.out.println("    remap as-we-go");
          // Remap every ord to global ord as we iterate:
          int doc;
          while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            //System.out.println("    doc=" + doc);
            segValues.setDocument(doc);
            int term = (int) segValues.nextOrd();
            while (term != SortedSetDocValues.NO_MORE_ORDS) {
              //System.out.println("      segOrd=" + segOrd + " ord=" + term + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, term));
              counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++;
              term = (int) segValues.nextOrd();
            }
          }
        } else {
          //System.out.println("    count in seg ord first");

          // First count in seg-ord space:
          final int[] segCounts = new int[numSegOrds];
          int doc;
          while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            //System.out.println("    doc=" + doc);
            segValues.setDocument(doc);
            int term = (int) segValues.nextOrd();
            while (term != SortedSetDocValues.NO_MORE_ORDS) {
              //System.out.println("      ord=" + term);
              segCounts[term]++;
              term = (int) segValues.nextOrd();
            }
          }

          // Then, migrate to global ords:
          for(int ord=0;ord<numSegOrds;ord++) {
            int count = segCounts[ord];
            if (count != 0) {
              //System.out.println("    migrate segOrd=" + segOrd + " ord=" + ord + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, ord));
              counts[(int) ordinalMap.getGlobalOrd(segOrd, ord)] += count;
            }
          }
        }
      } else {
        // No ord mapping (e.g., single segment index):
        // just aggregate directly into counts:
        int doc;
        while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          segValues.setDocument(doc);
          int term = (int) segValues.nextOrd();
          while (term != SortedSetDocValues.NO_MORE_ORDS) {
            counts[term]++;
            term = (int) segValues.nextOrd();
          }
        }
      }
    }
  }

    this.origReader = reader;

    // We need this to create thread-safe MultiSortedSetDV
    // per collector:
    topReader = SlowCompositeReaderWrapper.wrap(reader);
    SortedSetDocValues dv = topReader.getSortedSetDocValues(field);
    if (dv == null) {
      throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
    }
    if (dv.getValueCount() > Integer.MAX_VALUE) {
      throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount());
    }
    valueCount = (int) dv.getValueCount();

    // TODO: we can make this more efficient if eg we can be
    // "involved" when OrdinalMap is being created?  Ie see
    // each term/ord it's assigning as it goes...
    String lastDim = null;
    int startOrd = -1;
    BytesRef spare = new BytesRef();

    // TODO: can this approach work for a full hierarchy?
    // TaxoReader can't do this since ords are not in
    // "sorted order" ... but we should generalize this to
    // support arbitrary hierarchy:
    for(int ord=0;ord<valueCount;ord++) {
      dv.lookupOrd(ord, spare);
      String[] components = FacetsConfig.stringToPath(spare.utf8ToString());
      if (components.length != 2) {
        throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.toString(components) + " " + spare.utf8ToString());
      }
      if (!components[0].equals(lastDim)) {
        // ... (rest of the original source not shown)
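
The constructor above pairs with the sorted-set faceting classes from the same module. The following is a hedged sketch of that pairing; DefaultSortedSetDocValuesReaderState, SortedSetDocValuesFacetField, SortedSetDocValuesFacetCounts, FacetsCollector and the "Author"/"Lisa" values come from the facet module of this Lucene generation and are assumptions, not taken from the excerpt itself. "writer", "reader" and "searcher" are assumed to already exist.

    // Index side: FacetsConfig.build() encodes "Author/Lisa" into a sorted-set
    // doc values field, which is what stringToPath() above decodes again.
    FacetsConfig config = new FacetsConfig();
    Document doc = new Document();
    doc.add(new SortedSetDocValuesFacetField("Author", "Lisa"));
    writer.addDocument(config.build(doc));

    // Search side: the reader state must be rebuilt for every newly opened
    // IndexReader (see the IllegalStateException check earlier on this page).
    SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(reader);
    FacetsCollector fc = new FacetsCollector();
    FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc);
    Facets facets = new SortedSetDocValuesFacetCounts(state, fc);
    System.out.println(facets.getTopChildren(10, "Author"));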
