Source Code of org.apache.lucene.codecs.lucene40.Lucene40PostingsWriter$PendingTerm

package org.apache.lucene.codecs.lucene40;


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


/** Consumes doc & freq, writing them using the current
 *  index file format */


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;


/**
 * Concrete class that writes the 4.0 frq/prx postings format.
 * 
 * @see Lucene40PostingsFormat
 * @lucene.experimental 
 */
public final class Lucene40PostingsWriter extends PostingsWriterBase {


  final IndexOutput freqOut;
  final IndexOutput proxOut;
  final Lucene40SkipListWriter skipListWriter;
  /** Expert: The fraction of TermDocs entries stored in skip tables,
   * used to accelerate {@link DocsEnum#advance(int)}.  Larger values result in
   * smaller indexes, greater acceleration, but fewer accelerable cases, while
   * smaller values result in bigger indexes, less acceleration and more
   * accelerable cases. More detailed experiments would be useful here. */
  static final int DEFAULT_SKIP_INTERVAL = 16;
  final int skipInterval;
  
  /**
   * Expert: minimum docFreq to write any skip data at all
   */
  final int skipMinimum;


  /** Expert: The maximum number of skip levels. Smaller values result in 
   * slightly smaller indexes, but slower skipping in big posting lists.
   */
  final int maxSkipLevels = 10;
  final int totalNumDocs;
  IndexOutput termsOut;


  IndexOptions indexOptions;
  boolean storePayloads;
  boolean storeOffsets;
  // Starts a new term
  long freqStart;
  long proxStart;
  FieldInfo fieldInfo;
  int lastPayloadLength;
  int lastOffsetLength;
  int lastPosition;
  int lastOffset;


  // private String segment;


  /** Creates a {@link Lucene40PostingsWriter}, with the
   *  {@link #DEFAULT_SKIP_INTERVAL}. */
  public Lucene40PostingsWriter(SegmentWriteState state) throws IOException {
    this(state, DEFAULT_SKIP_INTERVAL);
  }
  
  /** Creates a {@link Lucene40PostingsWriter}, with the
   *  specified {@code skipInterval}. */
  public Lucene40PostingsWriter(SegmentWriteState state, int skipInterval) throws IOException {
    super();
    this.skipInterval = skipInterval;
    this.skipMinimum = skipInterval; /* set to the same for now */
    // this.segment = state.segmentName;
    String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION);
    freqOut = state.directory.createOutput(fileName, state.context);
    boolean success = false;
    IndexOutput proxOut = null;
    try {
      CodecUtil.writeHeader(freqOut, Lucene40PostingsReader.FRQ_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
      // TODO: this is a best effort, if one of these fields has no postings
      // then we make an empty prx file, same as if we are wrapped in 
      // per-field postingsformat. maybe... we shouldn't
      // bother w/ this opto?  just create empty prx file...?
      if (state.fieldInfos.hasProx()) {
        // At least one field does not omit TF, so create the
        // prox file
        fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION);
        proxOut = state.directory.createOutput(fileName, state.context);
        CodecUtil.writeHeader(proxOut, Lucene40PostingsReader.PRX_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
      } else {
        // Every field omits TF so we will write no prox file
        proxOut = null;
      }
      this.proxOut = proxOut;
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(freqOut, proxOut);
      }
    }


    totalNumDocs = state.segmentInfo.getDocCount();


    skipListWriter = new Lucene40SkipListWriter(skipInterval,
                                               maxSkipLevels,
                                               totalNumDocs,
                                               freqOut,
                                               proxOut);
  }


  @Override
  public void start(IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, Lucene40PostingsReader.TERMS_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
    termsOut.writeInt(skipInterval);                // write skipInterval
    termsOut.writeInt(maxSkipLevels);               // write maxSkipLevels
    termsOut.writeInt(skipMinimum);                 // write skipMinimum
  }


  @Override
  public void startTerm() {
    freqStart = freqOut.getFilePointer();
    //if (DEBUG) System.out.println("SPW: startTerm freqOut.fp=" + freqStart);
    if (proxOut != null) {
      proxStart = proxOut.getFilePointer();
    }
    // force first payload to write its length
    lastPayloadLength = -1;
    // force first offset to write its length
    lastOffsetLength = -1;
    skipListWriter.resetSkip();
  }


  // Currently, this instance is re-used across fields, so
  // our parent calls setField whenever the field changes
  @Override
  public void setField(FieldInfo fieldInfo) {
    //System.out.println("SPW: setField");
    /*
    if (BlockTreeTermsWriter.DEBUG && fieldInfo.name.equals("id")) {
      DEBUG = true;
    } else {
      DEBUG = false;
    }
    */
    this.fieldInfo = fieldInfo;
    indexOptions = fieldInfo.getIndexOptions();
    
    storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;        
    storePayloads = fieldInfo.hasPayloads();
    //System.out.println("  set init blockFreqStart=" + freqStart);
    //System.out.println("  set init blockProxStart=" + proxStart);
  }


  int lastDocID;
  int df;
  
  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    // if (DEBUG) System.out.println("SPW:   startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());


    final int delta = docID - lastDocID;
    
    if (docID < 0 || (df > 0 && delta <= 0)) {
      throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (freqOut: " + freqOut + ")");
    }


    if ((++df % skipInterval) == 0) {
      skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
      skipListWriter.bufferSkip(df);
    }


    assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs;


    lastDocID = docID;
    if (indexOptions == IndexOptions.DOCS_ONLY) {
      freqOut.writeVInt(delta);
    } else if (1 == termDocFreq) {
      freqOut.writeVInt((delta<<1) | 1);
    } else {
      freqOut.writeVInt(delta<<1);
      freqOut.writeVInt(termDocFreq);
    }


    lastPosition = 0;
    lastOffset = 0;
  }


  /** Add a new position & payload */
  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    //if (DEBUG) System.out.println("SPW:     addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
    assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 : "invalid indexOptions: " + indexOptions;
    assert proxOut != null;


    final int delta = position - lastPosition;
    
    assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition;            // not quite right (if pos=0 is repeated twice we don't catch it)


    lastPosition = position;


    int payloadLength = 0;


    if (storePayloads) {
      payloadLength = payload == null ? 0 : payload.length;


      if (payloadLength != lastPayloadLength) {
        lastPayloadLength = payloadLength;
        proxOut.writeVInt((delta<<1)|1);
        proxOut.writeVInt(payloadLength);
      } else {
        proxOut.writeVInt(delta << 1);
      }
    } else {
      proxOut.writeVInt(delta);
    }
    
    if (storeOffsets) {
      // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
      // and the numbers aren't that much smaller anyways.
      int offsetDelta = startOffset - lastOffset;
      int offsetLength = endOffset - startOffset;
      assert offsetDelta >= 0 && offsetLength >= 0 : "startOffset=" + startOffset + ",lastOffset=" + lastOffset + ",endOffset=" + endOffset;
      if (offsetLength != lastOffsetLength) {
        proxOut.writeVInt(offsetDelta << 1 | 1);
        proxOut.writeVInt(offsetLength);
      } else {
        proxOut.writeVInt(offsetDelta << 1);
      }
      lastOffset = startOffset;
      lastOffsetLength = offsetLength;
    }
    
    if (payloadLength > 0) {
      proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
    }
  }


  @Override
  public void finishDoc() {
  }


  private static class PendingTerm {
    public final long freqStart;
    public final long proxStart;
    public final long skipOffset;


    public PendingTerm(long freqStart, long proxStart, long skipOffset) {
      this.freqStart = freqStart;
      this.proxStart = proxStart;
      this.skipOffset = skipOffset;
    }
  }


  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();


  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(TermStats stats) throws IOException {


    // if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart);
    assert stats.docFreq > 0;


    // TODO: wasteful we are counting this (counting # docs
    // for this term) in two places?
    assert stats.docFreq == df;


    final long skipOffset;
    if (df >= skipMinimum) {
      skipOffset = skipListWriter.writeSkip(freqOut)-freqStart;
    } else {
      skipOffset = -1;
    }


    pendingTerms.add(new PendingTerm(freqStart, proxStart, skipOffset));


    lastDocID = 0;
    df = 0;
  }


  private final RAMOutputStream bytesWriter = new RAMOutputStream();


  @Override
  public void flushTermsBlock(int start, int count) throws IOException {
    //if (DEBUG) System.out.println("SPW: flushTermsBlock start=" + start + " count=" + count + " left=" + (pendingTerms.size()-count) + " pendingTerms.size()=" + pendingTerms.size());


    if (count == 0) {
      termsOut.writeByte((byte) 0);
      return;
    }


    assert start <= pendingTerms.size();
    assert count <= start;


    final int limit = pendingTerms.size() - start + count;
    final PendingTerm firstTerm = pendingTerms.get(limit - count);
    // First term in block is abs coded:
    bytesWriter.writeVLong(firstTerm.freqStart);


    if (firstTerm.skipOffset != -1) {
      assert firstTerm.skipOffset > 0;
      bytesWriter.writeVLong(firstTerm.skipOffset);
    }
    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
      bytesWriter.writeVLong(firstTerm.proxStart);
    }
    long lastFreqStart = firstTerm.freqStart;
    long lastProxStart = firstTerm.proxStart;
    for(int idx=limit-count+1; idx<limit; idx++) {
      final PendingTerm term = pendingTerms.get(idx);
      //if (DEBUG) System.out.println("  write term freqStart=" + term.freqStart);
      // The rest of the terms term are delta coded:
      bytesWriter.writeVLong(term.freqStart - lastFreqStart);
      lastFreqStart = term.freqStart;
      if (term.skipOffset != -1) {
        assert term.skipOffset > 0;
        bytesWriter.writeVLong(term.skipOffset);
      }
      if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
        bytesWriter.writeVLong(term.proxStart - lastProxStart);
        lastProxStart = term.proxStart;
      }
    }


    termsOut.writeVInt((int) bytesWriter.getFilePointer());
    bytesWriter.writeTo(termsOut);
    bytesWriter.reset();


    // Remove the terms we just wrote:
    pendingTerms.subList(limit-count, limit).clear();
  }


  @Override
  public void close() throws IOException {
    try {
      freqOut.close();
    } finally {
      if (proxOut != null) {
        proxOut.close();
      }
    }
  }
}
Source Code of org.apache.lucene.codecs.lucene40.Lucene40PostingsWriter$PendingTerm

Related Classes of org.apache.lucene.codecs.lucene40.Lucene40PostingsWriter$PendingTerm