Examples of org.apache.lucene.util.OpenBitSet

org.apache.lucene.util.OpenBitSet
An "open" BitSet implementation that allows direct access to the array of words storing the bits.
Unlike java.util.bitset, the fact that bits are packed into an array of longs is part of the interface. This allows efficient implementation of other algorithms by someone other than the author. It also allows one to efficiently implement alternate serialization or interchange formats.
OpenBitSet is faster than java.util.BitSet in most operations and *much* faster at calculating cardinality of sets and results of set operations. It can also handle sets of larger cardinality (up to 64 * 2**32-1)
The goals of OpenBitSet are the fastest implementation possible, and maximum code reuse. Extra safety and encapsulation may always be built on top, but if that's built in, the cost can never be removed (and hence people re-implement their own version in order to get better performance). If you want a "safe", totally encapsulated (and slower and limited) BitSet class, use java.util.BitSet.

Performance Results
Test system: Pentium 4, Sun Java 1.5_06 -server -Xbatch -Xmx64M
BitSet size = 1,000,000
Results are java.util.BitSet time divided by OpenBitSet time.

cardinality intersect_count union nextSetBit get iterator

50% full 3.36 3.96 1.44 1.46 1.99 1.58

1% full 3.31 3.90 1.04 0.99

Test system: AMD Opteron, 64 bit linux, Sun Java 1.5_06 -server -Xbatch -Xmx64M
BitSet size = 1,000,000
Results are java.util.BitSet time divided by OpenBitSet time.

cardinality intersect_count union nextSetBit get iterator

50% full 2.50 3.50 1.00 1.03 1.12 1.25

1% full 2.51 3.49 1.00 1.02

@version $Id$

	cardinality	intersect_count	union	nextSetBit	get	iterator
50% full	3.36	3.96	1.44	1.46	1.99	1.58
1% full	3.31	3.90		1.04		0.99

	cardinality	intersect_count	union	nextSetBit	get	iterator
50% full	2.50	3.50	1.00	1.03	1.12	1.25
1% full	2.51	3.49		1.00		1.02

        System.out.println("  s2 ch=0x" + Integer.toHexString(s2.charAt(idx)));
      }
      */
    }


    final OpenBitSet isS1 = new OpenBitSet(NUM_DOCS);
    for(int idx=0;idx<NUM_DOCS;idx++) {
      if (random.nextBoolean()) {
        isS1.set(idx);
      }
    }


    final IndexReader r;
    if (true) { 
      final IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
        .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        .setMergePolicy(newLogMergePolicy());
      iwc.setRAMBufferSizeMB(16.0 + 16.0 * random.nextDouble());
      iwc.setMaxBufferedDocs(-1);
      final RandomIndexWriter riw = new RandomIndexWriter(random, dir, iwc);


      for(int idx=0;idx<NUM_DOCS;idx++) {
        final Document doc = new Document();
        String s = isS1.get(idx) ? s1 : s2;
        final Field f = newField("field", s, Field.Index.ANALYZED);
        final int count = _TestUtil.nextInt(random, 1, 4);
        for(int ct=0;ct<count;ct++) {
          doc.add(f);
        }
        riw.addDocument(doc);
      }


      r = riw.getReader();
      riw.close();
    } else {
      r = IndexReader.open(dir);
    }


    /*
    if (VERBOSE) {
      System.out.println("TEST: terms");
      TermEnum termEnum = r.terms();
      while(termEnum.next()) {
        System.out.println("  term=" + termEnum.term() + " len=" + termEnum.term().text().length());
        assertTrue(termEnum.docFreq() > 0);
        System.out.println("    s1?=" + (termEnum.term().text().equals(s1)) + " s1len=" + s1.length());
        System.out.println("    s2?=" + (termEnum.term().text().equals(s2)) + " s2len=" + s2.length());
        final String s = termEnum.term().text();
        for(int idx=0;idx<s.length();idx++) {
          System.out.println("      ch=0x" + Integer.toHexString(s.charAt(idx)));
        }
      }
    }
    */


    assertEquals(NUM_DOCS, r.numDocs());
    assertTrue(r.docFreq(new Term("field", s1)) > 0);
    assertTrue(r.docFreq(new Term("field", s2)) > 0);


    final byte[] payload = new byte[100];


    int num = atLeast(1000);
    for(int iter=0;iter<num;iter++) {


      final String term;
      final boolean doS1;
      if (random.nextBoolean()) {
        term = s1;
        doS1 = true;
      } else {
        term = s2;
        doS1 = false;
      }


      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1);
      }
        
      final TermPositions postings = r.termPositions(new Term("field", term));


      int docID = -1;
      while(docID < Integer.MAX_VALUE) {
        final int what = random.nextInt(3);
        if (what == 0) {
          if (VERBOSE) {
            System.out.println("TEST: docID=" + docID + "; do next()");
          }
          // nextDoc
          int expected = docID+1;
          while(true) {
            if (expected == NUM_DOCS) {
              expected = Integer.MAX_VALUE;
              break;
            } else if (isS1.get(expected) == doS1) {
              break;
            } else {
              expected++;
            }
          }
          boolean result = postings.next();
          if (!result) {
            assertEquals(Integer.MAX_VALUE, expected);
            if (VERBOSE) {
              System.out.println("  end");
            }
            break;
          } else {
            docID = postings.doc();
            if (VERBOSE) {
              System.out.println("  got docID=" + docID);
            }
            assertEquals(expected, docID);


            if (random.nextInt(6) == 3) {
              final int freq = postings.freq();
              assertTrue(freq >=1 && freq <= 4);
              for(int pos=0;pos<freq;pos++) {
                assertEquals(pos, postings.nextPosition());
                if (random.nextBoolean() && postings.isPayloadAvailable()) {
                  postings.getPayload(payload, 0);
                }
              }
            }
          }
        } else {
          // advance
          final int targetDocID;
          if (docID == -1) {
            targetDocID = random.nextInt(NUM_DOCS+1);
          } else {
            targetDocID = docID + _TestUtil.nextInt(random, 1, NUM_DOCS - docID);
          }
          if (VERBOSE) {
            System.out.println("TEST: docID=" + docID + "; do skipTo(" + targetDocID + ")");
          }
          int expected = targetDocID;
          while(true) {
            if (expected == NUM_DOCS) {
              expected = Integer.MAX_VALUE;
              break;
            } else if (isS1.get(expected) == doS1) {
              break;
            } else {
              expected++;
            }
          }

View Full Code Here

  }


  @Override
  public SpanFilterResult bitSpans(IndexReader reader) throws IOException {


    final OpenBitSet bits = new OpenBitSet(reader.maxDoc());
    Spans spans = query.getSpans(reader);
    List<SpanFilterResult.PositionInfo> tmp = new ArrayList<SpanFilterResult.PositionInfo>(20);
    int currentDoc = -1;
    SpanFilterResult.PositionInfo currentInfo = null;
    while (spans.next())
    {
      int doc = spans.doc();
      bits.set(doc);
      if (currentDoc != doc)
      {
        currentInfo = new SpanFilterResult.PositionInfo(doc);
        tmp.add(currentInfo);
        currentDoc = doc;

View Full Code Here

    assertDocIdSetCacheable(reader, FieldCacheRangeFilter.newIntRange("test", Integer.valueOf(10), Integer.valueOf(20), true, true), true);
    // a openbitset filter is always cacheable
    assertDocIdSetCacheable(reader, new Filter() {
      @Override
      public DocIdSet getDocIdSet(IndexReader reader) {
        return new OpenBitSet();
      }
    }, true);


    reader.close();
  }

View Full Code Here

    public SimpleDocIdSetFilter(int[] docs) {
      this.docs = docs;
    }
    @Override
    public DocIdSet getDocIdSet(IndexReader reader) {
      final OpenBitSet set = new OpenBitSet();
      final int limit = docBase+reader.maxDoc();
      for (;index < docs.length; index++) {
        final int docId = docs[index];
        if(docId > limit)
          break;
        set.set(docId-docBase);
      }
      docBase = limit;
      return set.isEmpty()?null:set;
    }

View Full Code Here

  
  /** Note: The neededBounds iterator must be unsigned (easier understanding what's happening) */
  private void assertLongRangeSplit(final long lower, final long upper, int precisionStep,
    final boolean useBitSet, final Iterator neededBounds, final Iterator neededShifts
  ) throws Exception {
    final OpenBitSet bits=useBitSet ? new OpenBitSet(upper-lower+1) : null;
    
    NumericUtils.splitLongRange(new NumericUtils.LongRangeBuilder() {
      //@Override
      public void addRange(long min, long max, int shift) {
        assertTrue("min, max should be inside bounds", min>=lower && min<=upper && max>=lower && max<=upper);
        if (useBitSet) for (long l=min; l<=max; l++) {
          assertFalse("ranges should not overlap", bits.getAndSet(l-lower) );
          // extra exit condition to prevent overflow on MAX_VALUE
          if (l == max) break;
        }
        if (neededBounds == null || neededShifts == null)
          return;
        // make unsigned longs for easier display and understanding
        min ^= 0x8000000000000000L;
        max ^= 0x8000000000000000L;
        //System.out.println("new Long(0x"+Long.toHexString(min>>>shift)+"L),new Long(0x"+Long.toHexString(max>>>shift)+"L)/*shift="+shift+"*/,");
        assertEquals( "shift", ((Integer)neededShifts.next()).intValue(), shift);
        assertEquals( "inner min bound", ((Long)neededBounds.next()).longValue(), min>>>shift);
        assertEquals( "inner max bound", ((Long)neededBounds.next()).longValue(), max>>>shift);
      }
    }, precisionStep, lower, upper);
    
    if (useBitSet) {
      // after flipping all bits in the range, the cardinality should be zero
      bits.flip(0,upper-lower+1);
      assertTrue("The sub-range concenated should match the whole range", bits.isEmpty());
    }
  }

View Full Code Here


  /** Note: The neededBounds iterator must be unsigned (easier understanding what's happening) */
  private void assertIntRangeSplit(final int lower, final int upper, int precisionStep,
    final boolean useBitSet, final Iterator neededBounds, final Iterator neededShifts
  ) throws Exception {
    final OpenBitSet bits=useBitSet ? new OpenBitSet(upper-lower+1) : null;
    
    NumericUtils.splitIntRange(new NumericUtils.IntRangeBuilder() {
      //@Override
      public void addRange(int min, int max, int shift) {
        assertTrue("min, max should be inside bounds", min>=lower && min<=upper && max>=lower && max<=upper);
        if (useBitSet) for (int i=min; i<=max; i++) {
          assertFalse("ranges should not overlap", bits.getAndSet(i-lower) );
          // extra exit condition to prevent overflow on MAX_VALUE
          if (i == max) break;
        }
        if (neededBounds == null)
          return;
        // make unsigned ints for easier display and understanding
        min ^= 0x80000000;
        max ^= 0x80000000;
        //System.out.println("new Integer(0x"+Integer.toHexString(min>>>shift)+"),new Integer(0x"+Integer.toHexString(max>>>shift)+")/*shift="+shift+"*/,");
        assertEquals( "shift", ((Integer)neededShifts.next()).intValue(), shift);
        assertEquals( "inner min bound", ((Integer)neededBounds.next()).intValue(), min>>>shift);
        assertEquals( "inner max bound", ((Integer)neededBounds.next()).intValue(), max>>>shift);
      }
    }, precisionStep, lower, upper);
    
    if (useBitSet) {
      // after flipping all bits in the range, the cardinality should be zero
      bits.flip(0,upper-lower+1);
      assertTrue("The sub-range concenated should match the whole range", bits.isEmpty());
    }
  }

View Full Code Here

        valueCount++;
      }
      
      int docCount = 0;
      long ordCount = 0;
      OpenBitSet seenOrds = new OpenBitSet(valueCount);
      Iterator<Number> ordIterator = ords.iterator();
      for (Number v : docToOrdCount) {
        assert v != null;
        int count = v.intValue();
        assert count >= 0;
        docCount++;
        ordCount += count;
        
        long lastOrd = -1;
        for (int i = 0; i < count; i++) {
          Number o = ordIterator.next();
          assert o != null;
          long ord = o.longValue();
          assert ord >= 0 && ord < valueCount;
          assert ord > lastOrd : "ord=" + ord + ",lastOrd=" + lastOrd;
          seenOrds.set(ord);
          lastOrd = ord;
        }
      }
      assert ordIterator.hasNext() == false;
      
      assert docCount == maxDoc;
      assert seenOrds.cardinality() == valueCount;
      checkIterator(values.iterator(), valueCount, false);
      checkIterator(docToOrdCount.iterator(), maxDoc, false);
      checkIterator(ords.iterator(), ordCount, false);
      in.addSortedSetField(field, values, docToOrdCount, ords);
    }

View Full Code Here

    }


    BigNestedIntArray nestedArray = new BigNestedIntArray();
    nestedArray.load(maxId, loader);


    OpenBitSet filter = new OpenBitSet(numVals);
    for (int i = 0; i < numVals; i++) {
      if (i % 2 == 0) {
        filter.set(i);
      }
    }


    for (int i = 0; i < maxId; i++) {
      nestedArray.countNoReturnWithFilter(i, count, filter);

View Full Code Here

    public void write(final FieldsConsumer consumer) throws Throwable {
      Arrays.sort(terms);
      final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
      long sumTotalTermCount = 0;
      long sumDF = 0;
      OpenBitSet visitedDocs = new OpenBitSet();
      for (final TermData term : terms) {
        for (int i = 0; i < term.docs.length; i++) {
          visitedDocs.set(term.docs[i]);
        }
        sumDF += term.docs.length;
        sumTotalTermCount += term.write(termsConsumer);
      }
      termsConsumer.finish(omitTF ? -1 : sumTotalTermCount, sumDF, (int) visitedDocs.cardinality());
    }

View Full Code Here

    TermValueList<T> list = (listFactory == null ? (TermValueList<T>) new TermStringList()
        : listFactory.createTermList());
    IntArrayList minIDList = new IntArrayList();
    IntArrayList maxIDList = new IntArrayList();
    IntArrayList freqList = new IntArrayList();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = getNegativeValueCount(reader, field);
    int t = 1; // valid term id starts from 1
    list.add(null);
    minIDList.add(-1);
    maxIDList.add(-1);
    freqList.add(0);


    _overflow = false;


    String pre = null;


    int df = 0;
    int minID = -1;
    int maxID = -1;
    int docID = -1;
    int valId = 0;


    Terms terms = reader.terms(field);
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator(null);
      BytesRef text;
      while ((text = termsEnum.next()) != null) {
        String strText = text.utf8ToString();
        String val = null;
        int weight = 0;
        String[] split = strText.split("\u0000");
        if (split.length > 1) {
          val = split[0];
          weight = Integer.parseInt(split[split.length - 1]);
        } else {
          continue;
        }


        if (pre == null || !val.equals(pre)) {
          if (pre != null) {
            freqList.add(df);
            minIDList.add(minID);
            maxIDList.add(maxID);
          }
          list.add(val);
          df = 0;
          minID = -1;
          maxID = -1;
          valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
          t++;
        }


        Term term = new Term(field, strText);
        DocsEnum docsEnum = reader.termDocsEnum(term);
        if (docsEnum != null) {
          while ((docID = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
            df++;


            if (!loader.add(docID, valId)) {
              logOverflow(fieldName);
            } else {
              weightLoader.add(docID, weight);
            }


            if (docID < minID) minID = docID;
            bitset.fastSet(docID);
            while (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
              docID = docsEnum.docID();
              df++;
              if (!loader.add(docID, valId)) {
                logOverflow(fieldName);
              } else {
                weightLoader.add(docID, weight);
              }
              bitset.fastSet(docID);
            }
            if (docID > maxID) maxID = docID;
          }
        }
        pre = val;
      }
      if (pre != null) {
        freqList.add(df);
        minIDList.add(minID);
        maxIDList.add(maxID);
      }
    }


    list.seal();
    // Process minIDList and maxIDList for negative number
    for (int i = 1; i < negativeValueCount/2 + 1; ++i) {
      int top = i;
      int tail = negativeValueCount - i + 1;
      int topValue = minIDList.getInt(top);
      int tailValue = minIDList.getInt(tail);
      minIDList.set(top, tailValue);
      minIDList.set(tail, topValue);
      topValue = maxIDList.getInt(top);
      tailValue = maxIDList.getInt(tail);
      maxIDList.set(top, tailValue);
      maxIDList.set(tail, topValue);
    }


    try {
      _nestedArray.load(maxdoc + 1, loader);
      _weightArray.load(maxdoc + 1, weightLoader);
    } catch (IOException e) {
      throw e;
    } catch (Exception e) {
      throw new RuntimeException("failed to load due to " + e.toString(), e);
    }


    this.valArray = list;
    this.freqs = freqList.toIntArray();
    this.minIDs = minIDList.toIntArray();
    this.maxIDs = maxIDList.toIntArray();


    int doc = 0;
    while (doc < maxdoc && !_nestedArray.contains(doc, 0, true)) {
      ++doc;
    }
    if (doc < maxdoc) {
      this.minIDs[0] = doc;
      doc = maxdoc - 1;
      while (doc >= 0 && !_nestedArray.contains(doc, 0, true)) {
        --doc;
      }
      this.maxIDs[0] = doc;
    }
    this.freqs[0] = maxdoc - (int) bitset.cardinality();
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.util.OpenBitSet

com.browseengine.bobo.facets.data.MultiValueFacetDataCache

com.browseengine.bobo.facets.data.MultiValueWithWeightFacetDataCache

com.browseengine.bobo.facets.filter.BitSetFilter

com.browseengine.bobo.facets.filter.FacetOrFilter$FacetOrRandomAccessDocIdSet

com.browseengine.bobo.facets.filter.MultiValueORFacetFilter

com.browseengine.bobo.facets.range.ValueConverterBitSetBuilder

com.esri.gpt.catalog.lucene.AclFilter

com.esri.gpt.catalog.lucene.IsPartOfFilter

com.esri.gpt.catalog.lucene.SchemaFilter

com.esri.gpt.server.assertion.index.AsnFilter

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.

Examples of org.apache.lucene.util.OpenBitSet

Performance Results

Related Classes of org.apache.lucene.util.OpenBitSet