Package org.apache.lucene.index

Examples of org.apache.lucene.index.DocsAndPositionsEnum
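Every snippet below follows the same iteration contract: obtain the enum from a TermsEnum (or a convenience helper such as MultiFields.getTermPositionsEnum), call nextDoc() until NO_MORE_DOCS, read freq(), and call nextPosition() exactly freq() times per document before asking for offsets or payloads. The following is a minimal, self-contained sketch of that pattern against the Lucene 4.x API; the index path, the field name "body", and the term "lucene" are illustrative assumptions, not part of any snippet below.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class DocsAndPositionsDemo {
  public static void main(String[] args) throws IOException {
    Directory dir = FSDirectory.open(new File("/path/to/index")); // hypothetical index location
    IndexReader reader = DirectoryReader.open(dir);

    // Pass MultiFields.getLiveDocs(reader) instead of null to skip deleted docs.
    DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(
        reader, null, "body", new BytesRef("lucene"));

    // null means the field was not indexed with positions.
    if (positions != null) {
      int doc;
      while ((doc = positions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        final int freq = positions.freq();
        for (int i = 0; i < freq; i++) {
          int pos = positions.nextPosition();
          // startOffset()/endOffset() return -1 unless offsets were indexed;
          // getPayload() returns null when the current position carries no payload.
          System.out.println("doc=" + doc + " pos=" + pos
              + " offsets=" + positions.startOffset() + "-" + positions.endOffset());
        }
      }
    }
    reader.close();
    dir.close();
  }
}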


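The first snippet appears to come from a memory-resident postings implementation: it walks every term with a TermsEnum, copies the term bytes into one shared buffer, and then materializes the postings, packing low-frequency terms into a single int[] (docs, freqs, positions, offsets, and payload lengths interleaved) while giving high-frequency terms parallel per-document arrays: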
      hasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0;
      hasPayloads = fieldInfo.hasPayloads();

      BytesRef term;
      DocsEnum docsEnum = null;
      DocsAndPositionsEnum docsAndPositionsEnum = null;
      final TermsEnum termsEnum = termsIn.iterator(null);
      int termOffset = 0;

      final IntArrayWriter scratch = new IntArrayWriter();

      // Used for payloads, if any:
      final RAMOutputStream ros = new RAMOutputStream();

      // if (DEBUG) {
      //   System.out.println("\nLOAD terms seg=" + state.segmentInfo.name + " field=" + field + " hasOffsets=" + hasOffsets + " hasFreq=" + hasFreq + " hasPos=" + hasPos + " hasPayloads=" + hasPayloads);
      // }

      while ((term = termsEnum.next()) != null) {
        final int docFreq = termsEnum.docFreq();
        final long totalTermFreq = termsEnum.totalTermFreq();

        // if (DEBUG) {
        //   System.out.println("  term=" + term.utf8ToString());
        // }

        termOffsets[count] = termOffset;

        if (termBytes.length < (termOffset + term.length)) {
          termBytes = ArrayUtil.grow(termBytes, termOffset + term.length);
        }
        System.arraycopy(term.bytes, term.offset, termBytes, termOffset, term.length);
        termOffset += term.length;
        termOffsets[count+1] = termOffset;

        if (hasPos) {
          docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
        } else {
          docsEnum = termsEnum.docs(null, docsEnum);
        }

        final TermAndSkip ent;

        final DocsEnum docsEnum2;
        if (hasPos) {
          docsEnum2 = docsAndPositionsEnum;
        } else {
          docsEnum2 = docsEnum;
        }

        int docID;

        if (docFreq <= lowFreqCutoff) {

          ros.reset();

          // Pack postings for low-freq terms into a single int[]:
          while ((docID = docsEnum2.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
            scratch.add(docID);
            if (hasFreq) {
              final int freq = docsEnum2.freq();
              scratch.add(freq);
              if (hasPos) {
                for(int pos=0;pos<freq;pos++) {
                  scratch.add(docsAndPositionsEnum.nextPosition());
                  if (hasOffsets) {
                    scratch.add(docsAndPositionsEnum.startOffset());
                    scratch.add(docsAndPositionsEnum.endOffset());
                  }
                  if (hasPayloads) {
                    final BytesRef payload = docsAndPositionsEnum.getPayload();
                    if (payload != null) {
                      scratch.add(payload.length);
                      ros.writeBytes(payload.bytes, payload.offset, payload.length);
                    } else {
                      scratch.add(0);
                    }
                  }
                }
              }
            }
          }

          final byte[] payloads;
          if (hasPayloads) {
            payloads = new byte[(int) ros.getFilePointer()];
            ros.writeTo(payloads, 0);
          } else {
            payloads = null;
          }

          final int[] postings = scratch.get();
       
          ent = new LowFreqTerm(postings, payloads, docFreq, (int) totalTermFreq);
        } else {
          final int[] docs = new int[docFreq];
          final int[] freqs;
          final int[][] positions;
          final byte[][][] payloads;
          if (hasFreq) {
            freqs = new int[docFreq];
            if (hasPos) {
              positions = new int[docFreq][];
              if (hasPayloads) {
                payloads = new byte[docFreq][][];
              } else {
                payloads = null;
              }
            } else {
              positions = null;
              payloads = null;
            }
          } else {
            freqs = null;
            positions = null;
            payloads = null;
          }

          // Use separate int[] for the postings for high-freq
          // terms:
          int upto = 0;
          while ((docID = docsEnum2.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
            docs[upto] = docID;
            if (hasFreq) {
              final int freq = docsEnum2.freq();
              freqs[upto] = freq;
              if (hasPos) {
                final int mult;
                if (hasOffsets) {
                  mult = 3;
                } else {
                  mult = 1;
                }
                if (hasPayloads) {
                  payloads[upto] = new byte[freq][];
                }
                positions[upto] = new int[mult*freq];
                int posUpto = 0;
                for(int pos=0;pos<freq;pos++) {
                  positions[upto][posUpto] = docsAndPositionsEnum.nextPosition();
                  if (hasPayloads) {
                    BytesRef payload = docsAndPositionsEnum.getPayload();
                    if (payload != null) {
                      byte[] payloadBytes = new byte[payload.length];
                      System.arraycopy(payload.bytes, payload.offset, payloadBytes, 0, payload.length);
                      payloads[upto][pos] = payloadBytes;
                    }
                  }
                  posUpto++;
                  if (hasOffsets) {
                    positions[upto][posUpto++] = docsAndPositionsEnum.startOffset();
                    positions[upto][posUpto++] = docsAndPositionsEnum.endOffset();
                  }
                }
              }
            }

View Full Code Here


 
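This test iterates a (possibly sorting) DocsAndPositionsEnum, checking freq(), positions, offsets, and payloads for every document, and then passes the exhausted enum back to docsAndPositions() to verify that it is actually reused: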
  @Test
  public void testDocsAndPositionsEnum() throws Exception {
    TermsEnum termsEnum = reader.terms(DOC_POSITIONS_FIELD).iterator(null);
    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(DOC_POSITIONS_TERM)));
    DocsAndPositionsEnum sortedPositions = termsEnum.docsAndPositions(null, null);
    int doc;
   
    // test nextDoc()
    while ((doc = sortedPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      int freq = sortedPositions.freq();
      assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq);
      for (int i = 0; i < freq; i++) {
        assertEquals("incorrect position for doc=" + doc, i, sortedPositions.nextPosition());
        if (!doesntSupportOffsets.contains(TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) {
          assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.startOffset());
          assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.endOffset());
        }
        assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(sortedPositions.getPayload().utf8ToString()));
      }
    }
   
    // test advance()
    final DocsAndPositionsEnum reuse = sortedPositions;
    sortedPositions = termsEnum.docsAndPositions(null, reuse);
    if (sortedPositions instanceof SortingDocsAndPositionsEnum) {
      assertTrue(((SortingDocsAndPositionsEnum) sortedPositions).reused(reuse)); // make sure reuse worked
    }
    doc = 0;
View Full Code Here

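A suggest-highlighting consumer (the names matchedTokens and prefixToken point to an infix suggester) requests offsets with FLAG_OFFSETS and records the earliest position at which any matched or prefix token occurs: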
      String docTerm = term.utf8ToString();

      if (matchedTokens.contains(docTerm) || docTerm.startsWith(prefixToken)) {

        DocsAndPositionsEnum docPosEnum = it.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
        docPosEnum.nextDoc();

        // use the first occurrence of the term
        int p = docPosEnum.nextPosition();
        if (p < position) {
          position = p;
        }
      }
    }
View Full Code Here

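Term vectors expose positions only when they were indexed with them; here docsAndPositions() is expected to return null for the plain "tv" field and a usable enum for the others: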
      Terms tvs = tvFields.terms(field);
      assertNotNull(tvs);
      assertEquals(2, tvs.size());
      TermsEnum tvsEnum = tvs.iterator(null);
      assertEquals(new BytesRef("abc"), tvsEnum.next());
      final DocsAndPositionsEnum dpEnum = tvsEnum.docsAndPositions(null, null);
      if (field.equals("tv")) {
        assertNull(dpEnum);
      } else {
        assertNotNull(dpEnum);
      }
View Full Code Here

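A term-vector walk over a single term with totalTermFreq() == 2, asserting both positions along with their character offsets: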
    Terms vector = r.getTermVectors(0).terms("field");
    assertEquals(1, vector.size());
    TermsEnum termsEnum = vector.iterator(null);
    termsEnum.next();
    assertEquals(2, termsEnum.totalTermFreq());
    DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null);
    assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, positions.freq());
    positions.nextPosition();
    assertEquals(0, positions.startOffset());
    assertEquals(4, positions.endOffset());
    positions.nextPosition();
    assertEquals(8, positions.startOffset());
    assertEquals(12, positions.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
    r.close();
    dir.close();
  }
View Full Code Here

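From a test that indexes a document containing an over-long term: the massive term itself is skipped, but the position counter must still advance past it, so "another" lands at position 3: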
    assertEquals(1, reader.docFreq(new Term("content", "term")));
    assertEquals(1, reader.docFreq(new Term("content", "another")));

    // Make sure position is still incremented when
    // massive term is skipped:
    DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
                                                                MultiFields.getLiveDocs(reader),
                                                                "content",
                                                                new BytesRef("another"));
    assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, tps.freq());
    assertEquals(3, tps.nextPosition());

    // Make sure the doc that has the massive term is in
    // the index:
    assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
View Full Code Here

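A fragment of flag-selection logic: offsets and payloads can be requested individually or together, and when docsAndPositions() returns null the caller falls back to a frequencies-only DocsEnum: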
          case 1: posFlags = DocsAndPositionsEnum.FLAG_OFFSETS; break;
          case 2: posFlags = DocsAndPositionsEnum.FLAG_PAYLOADS; break;
          default: posFlags = DocsAndPositionsEnum.FLAG_OFFSETS | DocsAndPositionsEnum.FLAG_PAYLOADS; break;
        }
        // TODO: cast to DocsAndPositionsEnum?
        DocsAndPositionsEnum docsAndPositions = termsEnum.docsAndPositions(liveDocs, null, posFlags);
        if (docsAndPositions != null) {
          return docsAndPositions;
        }
      }
      flags |= DocsEnum.FLAG_FREQS;
View Full Code Here

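The next snippet appears to rebuild a taxonomy's parents[] array from a payload-indexed posting list, where each document's single position encodes the parent ordinal of that category: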
    }
   
    // it's ok to use MultiFields because we only iterate on one posting list.
    // breaking it to loop over the leaves() only complicates code for no
    // apparent gain.
    DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(reader, null,
        Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
        DocsAndPositionsEnum.FLAG_PAYLOADS);

    // shouldn't really happen, if it does, something's wrong
    if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
      throw new CorruptIndexException("Missing parent data for category " + first);
    }
   
    int num = reader.maxDoc();
    for (int i = first; i < num; i++) {
      if (positions.docID() == i) {
        if (positions.freq() == 0) { // shouldn't happen
          throw new CorruptIndexException("Missing parent data for category " + i);
        }
       
        parents[i] = positions.nextPosition();
       
        if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
          if (i + 1 < num) {
            throw new CorruptIndexException("Missing parent data for category "+ (i + 1));
          }
          break;
        }
View Full Code Here

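In a sorting reader's docsAndPositions() override, a reused SortingDocsAndPositionsEnum is unwrapped first so the delegate codec sees the enum it originally produced: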
      return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, withFreqs, docMap);
    }

    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, final int flags) throws IOException {
      final DocsAndPositionsEnum inReuse;
      final SortingDocsAndPositionsEnum wrapReuse;
      if (reuse != null && reuse instanceof SortingDocsAndPositionsEnum) {
        // if we're asked to reuse the given DocsEnum and it is Sorting, return
        // the wrapped one, since some Codecs expect it.
        wrapReuse = (SortingDocsAndPositionsEnum) reuse;
        inReuse = wrapReuse.getWrapped();
      } else {
        wrapReuse = null;
        inReuse = reuse;
      }

      final DocsAndPositionsEnum inDocsAndPositions = in.docsAndPositions(newToOld(liveDocs), inReuse, flags);
      if (inDocsAndPositions == null) {
        return null;
      }

      // we ignore the fact that offsets may be stored but not asked for,
View Full Code Here

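Finally, a highlighter keeps an array of DocsAndPositionsEnum instances so that, for documents in the same segment, it can advance a previously obtained enum instead of pulling a fresh one: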
   
  private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException
    Map<Integer,String> highlights = new HashMap<Integer,String>();
   
    // reuse in the real sense... for docs in same segment we just advance our old enum
    DocsAndPositionsEnum postings[] = null;
    TermsEnum termsEnum = null;
    int lastLeaf = -1;

    PassageFormatter fieldFormatter = getFormatter(field);
    if (fieldFormatter == null) {
View Full Code Here
