Package org.apache.lucene.codecs.lucene40

Source Code of org.apache.lucene.codecs.lucene40.Lucene40PostingsWriter$PendingTerm

package org.apache.lucene.codecs.lucene40;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/** Consumes doc & freq, writing them using the current
*  index file format */

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/**
* Concrete class that writes the 4.0 frq/prx postings format.
*
* @see Lucene40PostingsFormat
* @lucene.experimental
*/
public final class Lucene40PostingsWriter extends PostingsWriterBase {

  final IndexOutput freqOut;
  final IndexOutput proxOut;
  final Lucene40SkipListWriter skipListWriter;
  /** Expert: The fraction of TermDocs entries stored in skip tables,
   * used to accelerate {@link DocsEnum#advance(int)}.  Larger values result in
   * smaller indexes, greater acceleration, but fewer accelerable cases, while
   * smaller values result in bigger indexes, less acceleration and more
   * accelerable cases. More detailed experiments would be useful here. */
  static final int DEFAULT_SKIP_INTERVAL = 16;
  final int skipInterval;
 
  /**
   * Expert: minimum docFreq to write any skip data at all
   */
  final int skipMinimum;

  /** Expert: The maximum number of skip levels. Smaller values result in
   * slightly smaller indexes, but slower skipping in big posting lists.
   */
  final int maxSkipLevels = 10;
  final int totalNumDocs;
  IndexOutput termsOut;

  IndexOptions indexOptions;
  boolean storePayloads;
  boolean storeOffsets;
  // Starts a new term
  long freqStart;
  long proxStart;
  FieldInfo fieldInfo;
  int lastPayloadLength;
  int lastOffsetLength;
  int lastPosition;
  int lastOffset;

  // private String segment;

  /** Creates a {@link Lucene40PostingsWriter}, with the
   *  {@link #DEFAULT_SKIP_INTERVAL}. */
  public Lucene40PostingsWriter(SegmentWriteState state) throws IOException {
    this(state, DEFAULT_SKIP_INTERVAL);
  }
 
  /** Creates a {@link Lucene40PostingsWriter}, with the
   *  specified {@code skipInterval}. */
  public Lucene40PostingsWriter(SegmentWriteState state, int skipInterval) throws IOException {
    super();
    this.skipInterval = skipInterval;
    this.skipMinimum = skipInterval; /* set to the same for now */
    // this.segment = state.segmentName;
    String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION);
    freqOut = state.directory.createOutput(fileName, state.context);
    boolean success = false;
    IndexOutput proxOut = null;
    try {
      CodecUtil.writeHeader(freqOut, Lucene40PostingsReader.FRQ_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
      // TODO: this is a best effort, if one of these fields has no postings
      // then we make an empty prx file, same as if we are wrapped in
      // per-field postingsformat. maybe... we shouldn't
      // bother w/ this opto?  just create empty prx file...?
      if (state.fieldInfos.hasProx()) {
        // At least one field does not omit TF, so create the
        // prox file
        fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION);
        proxOut = state.directory.createOutput(fileName, state.context);
        CodecUtil.writeHeader(proxOut, Lucene40PostingsReader.PRX_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
      } else {
        // Every field omits TF so we will write no prox file
        proxOut = null;
      }
      this.proxOut = proxOut;
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(freqOut, proxOut);
      }
    }

    totalNumDocs = state.segmentInfo.getDocCount();

    skipListWriter = new Lucene40SkipListWriter(skipInterval,
                                               maxSkipLevels,
                                               totalNumDocs,
                                               freqOut,
                                               proxOut);
  }

  @Override
  public void start(IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, Lucene40PostingsReader.TERMS_CODEC, Lucene40PostingsReader.VERSION_CURRENT);
    termsOut.writeInt(skipInterval);                // write skipInterval
    termsOut.writeInt(maxSkipLevels);               // write maxSkipLevels
    termsOut.writeInt(skipMinimum);                 // write skipMinimum
  }

  @Override
  public void startTerm() {
    freqStart = freqOut.getFilePointer();
    //if (DEBUG) System.out.println("SPW: startTerm freqOut.fp=" + freqStart);
    if (proxOut != null) {
      proxStart = proxOut.getFilePointer();
    }
    // force first payload to write its length
    lastPayloadLength = -1;
    // force first offset to write its length
    lastOffsetLength = -1;
    skipListWriter.resetSkip();
  }

  // Currently, this instance is re-used across fields, so
  // our parent calls setField whenever the field changes
  @Override
  public void setField(FieldInfo fieldInfo) {
    //System.out.println("SPW: setField");
    /*
    if (BlockTreeTermsWriter.DEBUG && fieldInfo.name.equals("id")) {
      DEBUG = true;
    } else {
      DEBUG = false;
    }
    */
    this.fieldInfo = fieldInfo;
    indexOptions = fieldInfo.getIndexOptions();
   
    storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;       
    storePayloads = fieldInfo.hasPayloads();
    //System.out.println("  set init blockFreqStart=" + freqStart);
    //System.out.println("  set init blockProxStart=" + proxStart);
  }

  int lastDocID;
  int df;
 
  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    // if (DEBUG) System.out.println("SPW:   startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq + " freqOut.fp=" + freqOut.getFilePointer());

    final int delta = docID - lastDocID;
   
    if (docID < 0 || (df > 0 && delta <= 0)) {
      throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (freqOut: " + freqOut + ")");
    }

    if ((++df % skipInterval) == 0) {
      skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
      skipListWriter.bufferSkip(df);
    }

    assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs;

    lastDocID = docID;
    if (indexOptions == IndexOptions.DOCS_ONLY) {
      freqOut.writeVInt(delta);
    } else if (1 == termDocFreq) {
      freqOut.writeVInt((delta<<1) | 1);
    } else {
      freqOut.writeVInt(delta<<1);
      freqOut.writeVInt(termDocFreq);
    }

    lastPosition = 0;
    lastOffset = 0;
  }

  /** Add a new position & payload */
  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    //if (DEBUG) System.out.println("SPW:     addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
    assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 : "invalid indexOptions: " + indexOptions;
    assert proxOut != null;

    final int delta = position - lastPosition;
   
    assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition;            // not quite right (if pos=0 is repeated twice we don't catch it)

    lastPosition = position;

    int payloadLength = 0;

    if (storePayloads) {
      payloadLength = payload == null ? 0 : payload.length;

      if (payloadLength != lastPayloadLength) {
        lastPayloadLength = payloadLength;
        proxOut.writeVInt((delta<<1)|1);
        proxOut.writeVInt(payloadLength);
      } else {
        proxOut.writeVInt(delta << 1);
      }
    } else {
      proxOut.writeVInt(delta);
    }
   
    if (storeOffsets) {
      // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
      // and the numbers aren't that much smaller anyways.
      int offsetDelta = startOffset - lastOffset;
      int offsetLength = endOffset - startOffset;
      assert offsetDelta >= 0 && offsetLength >= 0 : "startOffset=" + startOffset + ",lastOffset=" + lastOffset + ",endOffset=" + endOffset;
      if (offsetLength != lastOffsetLength) {
        proxOut.writeVInt(offsetDelta << 1 | 1);
        proxOut.writeVInt(offsetLength);
      } else {
        proxOut.writeVInt(offsetDelta << 1);
      }
      lastOffset = startOffset;
      lastOffsetLength = offsetLength;
    }
   
    if (payloadLength > 0) {
      proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
    }
  }

  @Override
  public void finishDoc() {
  }

  private static class PendingTerm {
    public final long freqStart;
    public final long proxStart;
    public final long skipOffset;

    public PendingTerm(long freqStart, long proxStart, long skipOffset) {
      this.freqStart = freqStart;
      this.proxStart = proxStart;
      this.skipOffset = skipOffset;
    }
  }

  private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();

  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(TermStats stats) throws IOException {

    // if (DEBUG) System.out.println("SPW: finishTerm seg=" + segment + " freqStart=" + freqStart);
    assert stats.docFreq > 0;

    // TODO: wasteful we are counting this (counting # docs
    // for this term) in two places?
    assert stats.docFreq == df;

    final long skipOffset;
    if (df >= skipMinimum) {
      skipOffset = skipListWriter.writeSkip(freqOut)-freqStart;
    } else {
      skipOffset = -1;
    }

    pendingTerms.add(new PendingTerm(freqStart, proxStart, skipOffset));

    lastDocID = 0;
    df = 0;
  }

  private final RAMOutputStream bytesWriter = new RAMOutputStream();

  @Override
  public void flushTermsBlock(int start, int count) throws IOException {
    //if (DEBUG) System.out.println("SPW: flushTermsBlock start=" + start + " count=" + count + " left=" + (pendingTerms.size()-count) + " pendingTerms.size()=" + pendingTerms.size());

    if (count == 0) {
      termsOut.writeByte((byte) 0);
      return;
    }

    assert start <= pendingTerms.size();
    assert count <= start;

    final int limit = pendingTerms.size() - start + count;
    final PendingTerm firstTerm = pendingTerms.get(limit - count);
    // First term in block is abs coded:
    bytesWriter.writeVLong(firstTerm.freqStart);

    if (firstTerm.skipOffset != -1) {
      assert firstTerm.skipOffset > 0;
      bytesWriter.writeVLong(firstTerm.skipOffset);
    }
    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
      bytesWriter.writeVLong(firstTerm.proxStart);
    }
    long lastFreqStart = firstTerm.freqStart;
    long lastProxStart = firstTerm.proxStart;
    for(int idx=limit-count+1; idx<limit; idx++) {
      final PendingTerm term = pendingTerms.get(idx);
      //if (DEBUG) System.out.println("  write term freqStart=" + term.freqStart);
      // The rest of the terms term are delta coded:
      bytesWriter.writeVLong(term.freqStart - lastFreqStart);
      lastFreqStart = term.freqStart;
      if (term.skipOffset != -1) {
        assert term.skipOffset > 0;
        bytesWriter.writeVLong(term.skipOffset);
      }
      if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
        bytesWriter.writeVLong(term.proxStart - lastProxStart);
        lastProxStart = term.proxStart;
      }
    }

    termsOut.writeVInt((int) bytesWriter.getFilePointer());
    bytesWriter.writeTo(termsOut);
    bytesWriter.reset();

    // Remove the terms we just wrote:
    pendingTerms.subList(limit-count, limit).clear();
  }

  @Override
  public void close() throws IOException {
    try {
      freqOut.close();
    } finally {
      if (proxOut != null) {
        proxOut.close();
      }
    }
  }
}
TOP

Related Classes of org.apache.lucene.codecs.lucene40.Lucene40PostingsWriter$PendingTerm

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.